import os
import math
import random
import numpy as np
import polars as pl
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 5]
plt.rcParams["figure.autolayout"] = True
pl.Config.set_tbl_rows(256)
import nmf.nmf as nmf
import knee.kneedle as kneedle
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Remove SciKit Learn warnings
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses
df = pl.read_excel('db.xlsx')
df = df.drop(['#'])
df
| mat0 | mat1 | abs_peak | abs_min | abs_max | em_peak | em_min | em_max | QY (%) | hopt (%) | PCE (%) |
|---|---|---|---|---|---|---|---|---|---|---|
| str | str | i64 | i64 | i64 | i64 | i64 | i64 | f64 | f64 | f64 |
| "dye" | "film" | 578 | 420 | 600 | 613 | 550 | 750 | 98.0 | 18.8 | null |
| "dye" | "bulk" | 540 | 400 | 600 | 590 | 540 | 690 | null | null | null |
| "dye" | "bulk" | 540 | 400 | 600 | 590 | 540 | 690 | null | null | null |
| "QD" | "bulk" | 730 | 600 | 900 | 910 | 800 | 1000 | null | 2.5 | null |
| "QD" | "bulk" | 730 | 600 | 900 | 910 | 800 | 1000 | null | 2.5 | null |
| "QD" | "fiber" | 730 | 600 | 900 | 910 | 800 | 1000 | null | 4.0 | null |
| "QD" | "fiber" | 730 | 600 | 900 | 910 | 800 | 1000 | null | 7.0 | null |
| "QD" | "solution" | 800 | 400 | 800 | 900 | 700 | 1000 | 30.0 | 1.4 | 3.2 |
| "QD" | "solution" | 600 | 400 | 600 | 630 | 600 | 700 | 50.0 | 0.5 | 1.2 |
| "dye" | "solution" | 550 | 450 | 600 | 580 | 550 | 650 | null | null | 1.3 |
| "QD" | "solution" | 376 | 300 | 650 | 623 | 600 | 700 | 50.0 | 0.3 | null |
| "dye" | "solution" | 413 | 400 | 600 | 657 | 600 | 800 | 67.0 | 3.4 | null |
| "dye" | "solution" | 550 | 450 | 600 | 629 | 550 | 750 | 95.0 | 2.6 | null |
| "dye" | "solution" | 466 | 300 | 750 | 685 | 700 | 900 | 11.0 | 0.6 | null |
| "dye" | "solution" | 557 | 300 | 750 | 802 | 700 | 900 | 9.0 | 0.5 | null |
| "polymer" | "solution" | 460 | 350 | 550 | 592 | 550 | 800 | 45.0 | 1.0 | null |
| "polymer" | "solution" | 467 | 300 | 550 | 593 | 550 | 700 | 48.0 | 0.9 | null |
| "dye" | "film" | 403 | 300 | 450 | 471 | 400 | 600 | 40.8 | 7.7 | null |
| "dye" | "bulk" | 374 | 300 | 400 | 450 | 400 | 600 | 100.0 | null | null |
| "dye" | "bulk" | 370 | 300 | 450 | 650 | 450 | 750 | 67.0 | 5.5 | null |
| "NP" | "bulk" | 470 | 400 | 550 | 597 | 500 | 725 | null | 4.09 | 3.55 |
| "dye" | "bulk" | 620 | 400 | 600 | 653 | 600 | 800 | null | null | 0.55 |
| "dye" | "bulk" | 340 | 300 | 400 | 540 | 400 | 700 | 14.0 | 0.25 | null |
| "dye" | "bulk" | 340 | 300 | 400 | 480 | 400 | 600 | 78.0 | 0.4 | null |
| "dye" | "bulk" | 580 | 300 | 600 | 620 | 600 | 800 | 100.0 | null | 0.0018 |
| "dye" | "film" | 745 | 400 | 850 | 808 | 700 | 900 | 25.0 | null | 0.61 |
| "dye" | "film" | 745 | 400 | 900 | 808 | 700 | 900 | 25.0 | null | 1.24 |
| "dye" | "film" | 745 | 400 | 850 | 808 | 700 | 900 | 25.0 | null | 0.54 |
| "dye" | "film" | 745 | 400 | 900 | 808 | 700 | 900 | 25.0 | null | 1.41 |
| "dye" | "bulk" | 525 | 400 | 650 | 610 | 500 | 800 | 97.6 | null | 2.6 |
| "NP" | "bulk" | 375 | 300 | 450 | 750 | 400 | 900 | 45.0 | 4.25 | 1.33 |
| "Lndye" | "fiber" | 540 | 350 | 550 | 630 | 500 | 700 | null | 0.29 | null |
| "dye" | "bulk" | 380 | 300 | 500 | 509 | 480 | 700 | null | 10.4 | 2.2 |
| "NP" | "bulk" | 550 | 400 | 750 | 800 | 600 | 1000 | 80.0 | 6.8 | null |
| "QD" | "film" | 450 | 370 | 600 | 630 | 600 | 700 | null | null | null |
| "QD" | "film" | 450 | 400 | 600 | 830 | 650 | 975 | 40.0 | null | null |
| "QD" | "waveguide" | 650 | 500 | 770 | 718 | 520 | 850 | 63.0 | 1.75 | null |
| "QD" | "solution" | 650 | 500 | 770 | 718 | 520 | 850 | 63.0 | 3.67 | null |
| "QD" | "film" | 480 | 300 | 500 | 619 | 570 | 670 | 36.2 | 2.95 | 2.25 |
| "CD" | "bulk" | 350 | 300 | 400 | 435 | 380 | 575 | null | 12.23 | 2.63 |
| "CD" | "bulk" | 350 | 280 | 400 | 450 | 390 | 600 | null | 4.52 | 2.49 |
| "CD" | "film" | 350 | 300 | 425 | 510 | 400 | 650 | 45.0 | 12.0 | null |
| "CD" | "film" | 358 | 225 | 425 | 441 | 400 | 600 | 94.0 | 3.9 | null |
| "CD" | "film" | 325 | 300 | 400 | 430 | 380 | 600 | null | 5.02 | 4.97 |
| "CD" | "bulk" | 340 | 300 | 600 | 540 | 450 | 700 | 40.0 | 0.92 | null |
| "CDQD" | "tandem" | 450 | 300 | 550 | 620 | 450 | 700 | 45.0 | 1.4 | null |
| "CDdye" | "bulk" | 470 | 350 | 550 | 560 | 380 | 750 | null | null | 4.06 |
| "CDQD" | "bulk" | 400 | 300 | 500 | 500 | 400 | 550 | 70.0 | null | 3.05 |
| "NP" | "film" | 400 | 300 | 500 | 600 | 500 | 700 | 25.0 | 1.85 | null |
| "NP" | "film" | 500 | 400 | 600 | 532 | 480 | 600 | 92.0 | null | null |
| "NP" | "bulk" | 325 | 300 | 520 | 518 | 475 | 525 | 58.0 | 2.4 | 1.8 |
| "CD" | "film" | 380 | 280 | 700 | 420 | 400 | 650 | null | 4.75 | 3.94 |
| "CD" | "bulk" | 440 | 300 | 500 | 540 | 450 | 700 | 25.0 | 1.2 | null |
| "QD" | "bulk" | 500 | 400 | 600 | 830 | 600 | 1000 | 50.0 | null | 2.85 |
| "QD" | "bulk" | 500 | 350 | 750 | 627 | 550 | 700 | 35.9 | 1.45 | null |
| "QD" | "film" | 450 | 350 | 750 | 862 | 800 | 1100 | 91.0 | 8.1 | 2.94 |
| "QD" | "bulk" | 350 | 300 | 500 | 550 | 450 | 775 | 81.0 | 26.5 | 8.71 |
| "QD" | "bulk" | 350 | 300 | 500 | 588 | 450 | 750 | null | null | 4.2 |
| "QD" | "film" | 350 | 400 | 800 | 740 | 600 | 850 | null | 6.97 | 3.18 |
| "QD" | "bulk" | 500 | 350 | 650 | 805 | 600 | 1000 | 78.0 | 6.4 | 3.1 |
| "QD" | "bulk" | 640 | 400 | 1000 | 960 | 700 | 1200 | 40.0 | null | 3.27 |
| "QD" | "bulk" | 415 | 310 | 620 | 918 | 620 | 1240 | 60.3 | null | 3.94 |
| "QD" | "bulk" | 375 | 300 | 500 | 638 | 500 | 825 | null | null | null |
| "QD" | "bulk" | 360 | 325 | 500 | 600 | 475 | 700 | 53.0 | null | null |
| "Ln" | "bulk" | 650 | 400 | 900 | 1140 | 1000 | 1200 | null | null | null |
| "dye" | "bulk" | 446 | 250 | 500 | 553 | 500 | 800 | 89.5 | 31.3 | null |
| "dye" | "bulk" | 478 | 250 | 600 | 601 | 550 | 800 | 61.1 | 22.0 | null |
| "dye" | "bulk" | 513 | 300 | 650 | 642 | 650 | 800 | 24.8 | 3.3 | null |
| "dye" | "bulk" | 449 | 250 | 550 | 571 | 500 | 800 | 80.0 | 27.8 | null |
| "dye" | "bulk" | 473 | 250 | 600 | 607 | 500 | 800 | 44.3 | 24.7 | null |
| "dye" | "film" | 760 | 600 | 800 | 787 | 700 | 900 | 24.0 | null | 0.44 |
| "dye" | "film" | 760 | 600 | 800 | 787 | 700 | 900 | 24.0 | null | 0.28 |
| "dye" | "film" | 700 | 550 | 800 | 784 | 700 | 900 | 30.0 | null | 0.62 |
| "dye" | "film" | 700 | 550 | 800 | 784 | 700 | 900 | 30.0 | null | 0.36 |
| "dye" | "film" | 738 | 550 | 850 | 819 | 700 | 900 | 23.0 | null | 0.41 |
| "dye" | "film" | 738 | 550 | 850 | 819 | 700 | 900 | 23.0 | null | 0.28 |
| "QD" | "bulk" | 450 | 350 | 1000 | 825 | 650 | 1000 | null | null | 7.9 |
| "QD" | "bulk" | 763 | 350 | 950 | 856 | 650 | 1050 | 70.0 | null | 4.74 |
| "dye" | "bulk" | 491 | 300 | 500 | 581 | 550 | 650 | 95.0 | 23.7 | 2.81 |
| "dye" | "bulk" | 580 | 350 | 600 | 620 | 400 | 800 | null | null | 7.1 |
| "dye" | "bulk" | 450 | 300 | 350 | 480 | 450 | 600 | 17.0 | null | 8.99 |
| "Ln" | "bulk" | 347 | 250 | 400 | 613 | 570 | 710 | null | 2.47 | 0.19 |
| "Ln" | "bulk" | 590 | 280 | 600 | 630 | 550 | 700 | 65.0 | null | 11.3 |
| "Ln" | "film" | 380 | 250 | 400 | 612 | 570 | 710 | 30.5 | 0.34 | 0.0019 |
| "Ln" | "film" | 360 | 250 | 380 | 545 | 450 | 700 | 1.6 | 0.27 | 0.00078 |
| "Ln" | "film" | 370 | 290 | 380 | 611 | 570 | 710 | 27.0 | 3.2 | 0.007 |
| "Ln" | "film" | 290 | 280 | 400 | 546 | 380 | 650 | 40.0 | 8.8 | null |
| "Ln" | "film" | 350 | 240 | 420 | 612 | 570 | 710 | 34.0 | 4.3 | null |
| "Ln" | "film" | 325 | 240 | 400 | 612 | 570 | 710 | 8.0 | 1.2 | null |
| "Ln" | "film" | 325 | 240 | 400 | 544 | 450 | 650 | 12.0 | 1.7 | null |
| "Ln" | "film" | 290 | 200 | 380 | 610 | 570 | 710 | null | null | null |
| "Ln" | "film" | 370 | 240 | 400 | 612 | 570 | 710 | 63.0 | 9.0 | null |
| "Ln" | "film" | 370 | 300 | 380 | 615 | 570 | 710 | 61.0 | 1.2 | 0.2 |
| "Ln" | "film" | 380 | 240 | 380 | 612 | 570 | 710 | 23.0 | 0.43 | 0.03 |
| "Ln" | "film" | 360 | 240 | 380 | 612 | 570 | 710 | 30.0 | 0.01 | 0.0006 |
| "Ln" | "fiber" | 360 | 240 | 420 | 615 | 570 | 710 | 85.0 | 2.3 | 0.00086 |
| "dye" | "film" | 520 | 300 | 540 | 590 | 550 | 700 | 78.0 | 0.08 | null |
| "dye" | "fiber" | 520 | 300 | 540 | 590 | 550 | 700 | 78.0 | 1.6 | 0.0052 |
| "dye" | "fiber" | 520 | 300 | 540 | 590 | 550 | 700 | 93.0 | 8.0 | 0.0024 |
| "Ln" | "film" | 340 | 250 | 380 | 613 | 570 | 710 | 44.0 | null | 0.0441 |
| "Ln" | "film" | 405 | 250 | 420 | 613 | 570 | 710 | 44.0 | null | 0.0499 |
| "Ln" | "film" | 350 | 250 | 400 | 613 | 570 | 710 | 73.0 | 0.28 | 0.28 |
| "Ln" | "film" | 320 | 250 | 350 | 613 | 570 | 710 | 86.0 | null | null |
| "dye" | "bulk" | 570 | 300 | 600 | 650 | 350 | 750 | 80.0 | 14.5 | 2.16 |
| "dye" | "film" | 770 | 300 | 800 | 775 | 700 | 950 | 16.0 | 1.5 | null |
| "dye" | "bulk" | 665 | 250 | 700 | 670 | 600 | 750 | 12.0 | 3.7 | 0.1 |
| "Ln" | "fiber" | 370 | 300 | 450 | 615 | 570 | 710 | 89.0 | 0.7 | null |
| "dye" | "fiber" | 560 | 300 | 600 | 580 | 550 | 700 | 95.0 | 2.1 | null |
| "dye" | "fiber" | 780 | 300 | 750 | 730 | 650 | 850 | 21.0 | 0.5 | null |
| "Ln" | "fiber" | 370 | 300 | 450 | 615 | 570 | 710 | 89.0 | null | 0.08 |
| "dye" | "fiber" | 560 | 300 | 600 | 580 | 550 | 700 | 95.0 | null | 0.21 |
| "QD" | "bulk" | 580 | 500 | 620 | 630 | 590 | 700 | 10.0 | null | 2.1 |
| "QD" | "bulk" | 580 | 500 | 620 | 630 | 590 | 700 | 60.0 | null | 2.1 |
| "dye" | "bulk" | 575 | 400 | 610 | 620 | 570 | 720 | 95.0 | null | 3.3 |
| "QD" | "bulk" | 580 | 500 | 620 | 630 | 590 | 700 | 10.0 | null | 2.1 |
| "QD" | "bulk" | 450 | 400 | 650 | 634 | 580 | 680 | 44.0 | null | 2.8 |
| "QD" | "bulk" | 450 | 400 | 600 | 580 | 550 | 650 | 86.0 | 48.0 | null |
| "dye" | "film" | 570 | 300 | 700 | 600 | 350 | 700 | null | null | null |
| "QD" | "film" | 396 | 350 | 450 | 582 | 500 | 700 | 53.0 | null | null |
| "Ln" | "bulk" | 530 | 300 | 580 | 630 | 550 | 750 | 83.0 | null | 1.44 |
| "QD" | "bulk" | 473 | 350 | 550 | 640 | 600 | 700 | 45.0 | 1.0 | null |
| "QD" | "bulk" | 350 | 300 | 450 | 550 | 450 | 750 | 56.0 | null | 8.71 |
| "QD" | "bulk" | 470 | 400 | 600 | 550 | 450 | 700 | 20.0 | 2.01 | null |
| "dye" | "bulk" | 521 | 450 | 550 | 539 | 500 | 700 | 93.0 | 54.0 | null |
| "CD" | "film" | 450 | 310 | 510 | 525 | 500 | 700 | 11.0 | 7.58 | 6.0 |
| "dye" | "film" | 545 | 400 | 700 | 680 | 550 | 800 | 50.0 | 12.5 | null |
| "dye" | "solution" | 498 | 300 | 700 | 580 | 520 | 700 | 30.0 | 6.88 | 0.27 |
| "dye" | "solution" | 569 | 350 | 600 | 595 | 550 | 750 | 61.0 | 2.58 | null |
| "dye" | "solution" | 488 | 300 | 500 | 510 | 450 | 600 | 51.0 | 3.3 | 0.35 |
| "CD" | "bulk" | 460 | 350 | 500 | 510 | 350 | 500 | 54.0 | 2.7 | null |
| "CD" | "bulk" | 340 | 300 | 500 | 520 | 400 | 700 | 6.0 | 5.84 | null |
| "CD" | "bulk" | 720 | 250 | 800 | 490 | 400 | 600 | 65.0 | null | 8.75 |
| "CD" | "film" | 420 | 300 | 550 | 515 | 400 | 700 | 40.0 | 1.6 | 0.7 |
| "CD" | "film" | 557 | 300 | 600 | 612 | 400 | 700 | 70.0 | 2.3 | null |
| "CD" | "film" | 420 | 300 | 550 | 500 | 440 | 600 | 67.0 | 2.2 | 1.13 |
| "dye" | "bulk" | 575 | 400 | 610 | 620 | 570 | 700 | 96.0 | 37.7 | null |
| "CD" | "solution" | 491 | 250 | 520 | 520 | 500 | 700 | 82.0 | 5.43 | 0.18 |
| "CD" | "film" | 510 | 250 | 520 | 535 | 400 | 650 | 78.0 | 0.058 | 0.00083 |
| "CD" | "film" | 510 | 250 | 520 | 535 | 400 | 650 | 78.0 | 1.7 | 0.014 |
| "dye" | "bulk" | 575 | 400 | 610 | 620 | 570 | 720 | 95.0 | 19.0 | 2.9 |
| "QD" | "bulk" | 480 | 400 | 660 | 622 | 550 | 700 | 15.7 | 3.2 | 0.62 |
| "CD" | "film" | 405 | 300 | 500 | 520 | 400 | 700 | 70.0 | 3.2 | 1.9 |
| "CD" | "film" | 405 | 300 | 500 | 520 | 400 | 700 | 65.0 | 2.9 | 1.7 |
| "QDdye" | "bulk" | 500 | 400 | 700 | 600 | 450 | 750 | 32.7 | 1.0 | null |
| "CD" | "film" | 347 | 280 | 500 | 540 | 450 | 700 | 61.0 | 4.56 | 4.1 |
| "Ln" | "bulk" | 405 | 250 | 500 | 520 | 450 | 700 | 81.0 | 3.4 | 1.37 |
| "CD" | "film" | 490 | 300 | 520 | 510 | 450 | 650 | 80.5 | null | 2.06 |
| "CD" | "film" | 490 | 300 | 520 | 510 | 450 | 650 | 80.5 | 4.8 | 4.36 |
| "CD" | "film" | 380 | 300 | 450 | 450 | 400 | 700 | 11.54 | 1.36 | null |
| "QD" | "film" | 300 | 300 | 550 | 515 | 450 | 550 | 35.91 | 3.08 | null |
| "QD" | "film" | 320 | 300 | 550 | 670 | 600 | 700 | 32.97 | 2.55 | null |
| "CDQD" | "film" | 320 | 300 | 550 | 515 | 400 | 700 | 23.0 | 1.89 | null |
| "CDQD" | "film" | 320 | 300 | 550 | 670 | 400 | 700 | 22.0 | 2.54 | null |
| "CDQD" | "film" | 320 | 300 | 550 | 500 | 400 | 700 | 26.0 | 3.76 | null |
| "CD" | "film" | 350 | 300 | 650 | 580 | 450 | 750 | 35.0 | null | 1.9 |
| "CD" | "film" | 380 | 300 | 500 | 520 | 400 | 700 | 35.0 | null | 1.7 |
| "CD" | "film" | 370 | 300 | 650 | 560 | 400 | 775 | 35.0 | null | 2.3 |
| "CD" | "film" | 400 | 300 | 500 | 517 | 450 | 700 | 33.0 | 4.5 | 0.117 |
| "CD" | "bulk" | 400 | 300 | 500 | 517 | 450 | 700 | 41.0 | 5.89 | 0.16 |
| "CD" | "bulk" | 400 | 300 | 500 | 517 | 450 | 700 | 41.0 | 3.13 | 0.061 |
| "CD" | "film" | 404 | 300 | 550 | 594 | 500 | 750 | 86.4 | 2.6 | 2.3 |
| "CD" | "film" | 555 | 300 | 650 | 650 | 575 | 800 | 17.6 | 3.0 | 2.7 |
| "CD" | "film" | 450 | 300 | 660 | 600 | 500 | 800 | 60.0 | 4.3 | 3.8 |
| "CD" | "solution" | 340 | 300 | 450 | 520 | 400 | 700 | null | 1.23 | 0.43 |
| "CD" | "film" | 340 | 300 | 450 | 520 | 400 | 700 | null | 0.9 | 0.62 |
| "CD" | "bulk" | 470 | 300 | 550 | 535 | 450 | 650 | 9.6 | 9.3 | null |
| "CD" | "film" | 380 | 300 | 500 | 570 | 450 | 700 | 41.52 | 3.51 | 2.39 |
| "CD" | "film" | 355 | 300 | 500 | 520 | 450 | 650 | 15.01 | 2.76 | 1.94 |
| "CD" | "film" | 410 | 300 | 600 | 608 | 550 | 750 | 7.6 | 2.77 | 1.96 |
| "CD" | "film" | 400 | 300 | 600 | 600 | 450 | 750 | 22.0 | 4.03 | 2.92 |
| "QD" | "bulk" | 600 | 400 | 650 | 625 | 550 | 700 | 30.0 | 2.7 | 0.38 |
| "QD" | "bulk" | 350 | 300 | 500 | 495 | 425 | 700 | 91.0 | null | 4.29 |
| "QD" | "bulk" | 350 | 300 | 500 | 495 | 425 | 700 | 91.0 | null | 0.55 |
| "QD" | "bulk" | 350 | 300 | 500 | 578 | 450 | 700 | 11.0 | null | 0.77 |
| "Ln" | "film" | 450 | 385 | 250 | 615 | 720 | 570 | 44.0 | 4.8 | 0.054 |
| "Ln" | "film" | 400 | 275 | 250 | 543 | 700 | 470 | 37.0 | 7.7 | 0.058 |
| "CD" | "film" | 425 | 340 | 250 | 425 | 600 | 400 | 11.0 | 13.1 | 0.053 |
| "Ln" | "film" | 450 | 385 | 250 | 615 | 720 | 570 | 44.0 | 5.2 | 0.046 |
| "Ln" | "film" | 400 | 275 | 250 | 543 | 700 | 470 | 37.0 | 7.7 | 0.047 |
| "CD" | "film" | 425 | 340 | 250 | 425 | 600 | 400 | 11.0 | 12.8 | 0.041 |
| "Ln" | "film" | 450 | 390 | 250 | 612 | 720 | 570 | 59.0 | 6.7 | 0.074 |
| "Ln" | "film" | 400 | 280 | 250 | 543 | 700 | 470 | 54.0 | 8.5 | 0.065 |
| "Ln" | "film" | 450 | 390 | 250 | 612 | 720 | 570 | 59.0 | 10.7 | 0.096 |
| "Ln" | "film" | 400 | 280 | 250 | 543 | 700 | 470 | 54.0 | 8.7 | 0.053 |
| "Ln" | "film" | 450 | 390 | 250 | 612 | 720 | 570 | 59.0 | 11.7 | 0.142 |
| "Ln" | "film" | 400 | 280 | 250 | 543 | 700 | 470 | 54.0 | 16.5 | 0.136 |
| "dye" | "film" | 650 | 415 | 250 | 690 | 900 | 600 | 4.0 | 2.6 | 0.044 |
| "polymer" | "film" | 500 | 392 | 250 | 491 | 650 | 400 | 47.0 | 5.71 | 2.29 |
| "polymer" | "film" | 500 | 392 | 250 | 473 | 650 | 450 | 57.0 | 9.112 | 2.32 |
| "polymer" | "film" | 500 | 392 | 250 | 454 | 650 | 400 | 68.0 | 12.08 | 2.47 |
| "polymer" | "film" | 500 | 392 | 250 | 491 | 650 | 400 | 47.0 | 5.71 | 4.38 |
| "polymer" | "film" | 500 | 392 | 250 | 473 | 650 | 450 | 57.0 | 9.112 | 4.62 |
| "polymer" | "film" | 500 | 392 | 250 | 454 | 650 | 400 | 68.0 | 12.08 | 4.92 |
| "NP" | "bulk" | 600 | 511 | 450 | 527 | 580 | 490 | 65.0 | 0.15 | 0.049413 |
| "dye" | "bulk" | 600 | 554 | 450 | 612 | 700 | 575 | 70.0 | 0.16 | 0.050786 |
| "NPdye" | "bulk" | 600 | 530 | 450 | 550 | 700 | 500 | 68.0 | 0.22 | 0.07531 |
| "QD" | "solution" | 510 | 450 | 300 | 517 | 550 | 475 | 89.0 | 2.32 | 0.020944 |
| "dye" | "solution" | 660 | 480 | 680 | 665 | 600 | 800 | 31.0 | 2.65 | 0.21 |
| "Ln" | "film" | 370 | 250 | 400 | 612 | 570 | 720 | 60.0 | 0.02 | 0.000198 |
| "CD" | "film" | 510 | 250 | 520 | 535 | 400 | 650 | 78.0 | 0.035 | 0.000182 |
| "Ln" | "film" | 370 | 250 | 400 | 612 | 570 | 720 | 60.0 | 0.048 | 0.000471 |
sns.pairplot(df.to_pandas())
fig = plt.gcf()
fig.savefig(f'figures/pairplot.pdf', bbox_inches='tight')
plt.show()
sns.pairplot(df.to_pandas(), hue='PCE (%)')
plt.show()
sns.pairplot(df.to_pandas(), hue='hopt (%)')
plt.show()
def idx_to_rowcol(idx, width):
r = int(idx / width)
c = int(idx % width)
return (r,c)
def rowcol_to_idx(r,c, width):
idx = int((r * width) + c)
return idx
def describe_variables(df, categorical=['mat0', 'mat1'], cols=4, figsize=(20,15), filename=None):
# get the pandas dataframe (since polars does not allow plots)
df_pandas = df.to_pandas()
for col in categorical:
df_pandas[col] = df_pandas[col].astype('category')
# compute the number of rows
total = (2 * len(df.get_columns())) - len(categorical)
rows = math.ceil(total/cols)
fig, axes = plt.subplots(rows, cols, constrained_layout = True, figsize=figsize)
#print(f'Total {total} -> ({rows} {cols})')
idx = 0
for col in df.get_columns():
r, c = idx_to_rowcol(idx, cols)
#print(f'({col.name}: {idx} -> {r}, {c})')
sns.histplot(ax=axes[r, c], data=df_pandas, x=col.name)
idx += 1
if col.name not in categorical:
r, c = idx_to_rowcol(idx, cols)
#print(f'({col.name}: {idx} -> {r}, {c})')
sns.boxplot(ax=axes[r, c], data=df_pandas, x=col.name)
idx += 1
if isinstance(filename, str):
fig = plt.gcf()
fig.savefig(f'figures/{filename}.pdf', bbox_inches='tight')
plt.show()
describe_variables(df, filename='feaures_description')
def print_missing_samples(df):
print(f'Shape: {df.shape}')
total, _ = df.shape
for col in df.get_columns():
print(f'{col.name:<6}: {col.is_null().sum()/total:.0%}')
print_missing_samples(df)
Shape: (201, 11) mat0 : 0% mat1 : 0% abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 14% hopt (%): 28% PCE (%): 36%
df_baseline_PCE = df.drop(['mat0', 'mat1', 'QY (%)', 'hopt (%)']).drop_nulls()
print_missing_samples(df_baseline_PCE)
Shape: (128, 7) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% PCE (%): 0%
df_baseline_hopt = df.drop(['mat0', 'mat1', 'QY (%)', 'PCE (%)']).drop_nulls()
print_missing_samples(df_baseline_hopt)
Shape: (144, 7) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% hopt (%): 0%
max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]
max_depth.append(None)
# TODO: fix this
# Print Models parameters
models = [('Linear Regression', LinearRegression(), {}),
('K Neighbors', KNeighborsRegressor(),
{'reg__n_neighbors':[1,3,5,7,9], 'reg__weights':['uniform', 'distance'], 'reg__p':[1,2]}),
('Random Forest', RandomForestRegressor(random_state=42),
{'reg__n_estimators':[50, 100, 150, 200], 'reg__min_samples_split':[2, 5, 10],
'reg__min_samples_leaf':[1, 2, 4], 'reg__bootstrap':[True, False], 'reg__max_depth':max_depth}),
('Gradient Boosting', GradientBoostingRegressor(random_state=42),
{'reg__n_estimators':[50, 100, 150, 200],'reg__min_samples_split':[2, 5, 10],
'reg__min_samples_leaf':[1, 2, 4], 'reg__max_depth':max_depth}),
('XGBoost', xgb.XGBRegressor(objective="reg:squarederror", random_state=42),
{'reg__max_depth': range (2, 10, 1), 'reg__n_estimators': range(60, 220, 40), 'reg__learning_rate': [0.1, 0.01, 0.05]})]
# remove heavy models that do not perform that well
#('Lasso Regression', Lasso(max_iter=5000, tol=1E-2, random_state=42), {'alpha': (np.logspace(-8, 8, 20))}),
#('Support Vector Regressor', SVR(),{'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01],'kernel': ['linear','rbf', 'sigmoid']}),
#('ANN', MLPRegressor(max_iter=50000, tol=1E-3, random_state=42),{'hidden_layer_sizes':[(8,), (16,), (32,)],'activation':['relu','logistic'], 'learning_rate':['constant', 'invscaling', 'adaptive']}),
def create_folds(X, y, b='auto', k=5):
if isinstance(b, str):
bins = np.histogram_bin_edges(y, bins=b)
# remove the last index (end point)
bins = bins[:-1]
elif isinstance(b, int):
bins = np.linspace(min(y), max(y), num=b, endpoint=False)
else:
raise Exception(f'Undefined bins {b}')
#print(f'Bins: {bins}')
groups = np.digitize(y, bins)
#print(f'Group: {groups}')
skf = StratifiedKFold(n_splits=k)
return skf.split(X, groups)
from collections.abc import Iterable
def compute_performance(models, X, y, b=None, k=[5, 3], filename=None):
if isinstance(k, Iterable):
outer_k = k[0]
inner_k = k[1]
elif isinstance(k, int):
outer_k = k
inner_k = k
else:
raise Exception(f'Undefined k {k}')
#kf = KFold(n_splits=cv)
folds = create_folds(X, y, b=b, k=outer_k)
perf_per_model = {}
y_true = []
y_true_train = []
y_preds = {}
y_preds_train = {}
#for train_index, test_index in kf.split(X):
for train_index, test_index in folds:
X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
y_true.extend(y_test)
y_true_train.extend(y_train)
# optimize each model and store the best result
best_models = []
for _, model, params in models:
pipeline = Pipeline([('sca', StandardScaler()),('reg', model)])
#clf = GridSearchCV(pipeline, params, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_folds = create_folds(X_train, y_train, b=b, k=inner_k)
#clf = GridSearchCV(pipeline, params, cv=cv, scoring='r2', n_jobs=-1)
clf = GridSearchCV(pipeline, params, cv=grid_folds, scoring='r2', n_jobs=-1)
clf.fit(X_train, y_train)
best_models.append(clf.best_estimator_)
# compute the predictions and store the results
for i in range(len(models)):
model = best_models[i]
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)
if models[i][0] not in y_preds:
y_preds[models[i][0]] = []
if models[i][0] not in y_preds_train:
y_preds_train[models[i][0]] = []
y_preds[models[i][0]].extend(y_pred)
y_preds_train[models[i][0]].extend(y_pred_train)
score_mae_test = mean_absolute_error(y_test, y_pred)
score_mse_test = mean_squared_error(y_test, y_pred)
score_r2_test = r2_score(y_test, y_pred)
score_mae_train = mean_absolute_error(y_train, y_pred_train)
score_mse_train = mean_squared_error(y_train, y_pred_train)
score_r2_train = r2_score(y_train, y_pred_train)
if models[i][0] not in perf_per_model:
perf_per_model[models[i][0]] = {'train':{'mae':[], 'mse': [], 'r2':[]},
'test':{'mae':[], 'mse': [], 'r2':[]}}
perf_per_model[models[i][0]]['test']['mae'].append(score_mae_test)
perf_per_model[models[i][0]]['test']['mse'].append(score_mse_test)
perf_per_model[models[i][0]]['test']['r2'].append(score_r2_test)
perf_per_model[models[i][0]]['train']['mae'].append(score_mae_train)
perf_per_model[models[i][0]]['train']['mse'].append(score_mse_train)
perf_per_model[models[i][0]]['train']['r2'].append(score_r2_train)
# print the results
print(f'Train')
print(f'| {"Model":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
print(f'| ----------------- | ------ | ------ | ------ |')
for i in range(len(models)):
mae = perf_per_model[models[i][0]]['train']['mae']
mse = perf_per_model[models[i][0]]['train']['mse']
r2 = perf_per_model[models[i][0]]['train']['r2']
#print(f'{mae} | {mse} | {r2}')
mae = np.mean(mae)
mse = np.mean(mse)
r2 = np.mean(r2)
print(f'| {models[i][0]:<17} | {round(mae, 2):>6} | {round(mse, 2):>6} | {round(r2, 2):>6} |')
print()
print(f'Test')
print(f'| {"Model":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
print(f'| ----------------- | ------ | ------ | ------ |')
for i in range(len(models)):
mae = perf_per_model[models[i][0]]['test']['mae']
mse = perf_per_model[models[i][0]]['test']['mse']
r2 = perf_per_model[models[i][0]]['test']['r2']
#print(f'{mae} | {mse} | {r2}')
mae = np.mean(mae)
mse = np.mean(mse)
r2 = np.mean(r2)
print(f'| {models[i][0]:<17} | {round(mae, 2):>6} | {round(mse, 2):>6} | {round(r2, 2):>6} |')
print()
# plot the graphs
print(f'Train')
data_train = []
labels_train = []
for name, _, _ in models:
labels_train.append(name)
# DEBUG STUFF:
#score_mae = mean_absolute_error(y_true_train, y_preds_train[name])
#score_mse = mean_squared_error(y_true_train, y_preds_train[name])
#score_r2 = r2_score(y_true_train, y_preds_train[name])
#print(f'Model {name} {score_mae} {score_mse} {score_r2}')
data_train.append(np.abs(np.subtract(y_true_train, y_preds_train[name])))
ax = sns.violinplot(data=data_train)
ax.set_xticklabels(labels_train)
if isinstance(filename, str):
fig = plt.gcf()
fig.savefig(f'figures/{filename}_train.pdf', bbox_inches='tight')
plt.show()
print(f'Test')
data_preds = []
labels_preds = []
for name, _, _ in models:
labels_preds.append(name)
# DEBUG STUFF:
#score_mae = mean_absolute_error(y_true, y_preds[name])
#score_mse = mean_squared_error(y_true, y_preds[name])
#score_r2 = r2_score(y_true, y_preds[name])
#print(f'Model {name} {score_mae} {score_mse} {score_r2}')
data_preds.append(np.abs(np.subtract(y_true, y_preds[name])))
ax = sns.violinplot(data=data_preds)
ax.set_xticklabels(labels_preds)
if isinstance(filename, str):
fig = plt.gcf()
fig.savefig(f'figures/{filename}_test.pdf', bbox_inches='tight')
plt.show()
# convert best_models to a dict and return it
bm = {}
for name, model, params in models:
# train on whole dataset
pipeline = Pipeline([('sca', StandardScaler()),('reg', model)])
#clf = GridSearchCV(pipeline, params, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_folds = create_folds(X, y, b=b, k=inner_k)
clf = GridSearchCV(pipeline, params, cv=grid_folds, scoring='r2', n_jobs=-1)
clf.fit(X, y)
bm[name] = clf.best_estimator_
return bm
# Get Output variables
PCE = df_baseline_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
#print(f'{y_PCE} {y_PCE.shape}')
# Get the Input variables
df_input = df_baseline_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()
#print(f'{X} {X.shape}')
## PCE
print(f'PCE')
models_pce_baseline = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_baseline')
PCE Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.53 | 4.39 | 0.13 | | K Neighbors | 0.22 | 0.46 | 0.91 | | Random Forest | 0.85 | 1.95 | 0.61 | | Gradient Boosting | 0.25 | 0.26 | 0.95 | | XGBoost | 0.76 | 1.56 | 0.69 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.66 | 5.15 | -0.01 | | K Neighbors | 1.31 | 4.1 | 0.2 | | Random Forest | 1.28 | 4.04 | 0.25 | | Gradient Boosting | 1.3 | 4.42 | 0.16 | | XGBoost | 1.32 | 4.26 | 0.2 | Train
Test
# Get Output variables
hopt = df_baseline_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]
# Get the Input variables
df_input = df_baseline_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()
## hopt
print(f'hopt')
models_hopt_baseline = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_baseline')
hopt Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 5.14 | 65.18 | 0.07 | | K Neighbors | 0.98 | 8.88 | 0.87 | | Random Forest | 3.09 | 28.37 | 0.6 | | Gradient Boosting | 1.13 | 4.37 | 0.94 | | XGBoost | 3.82 | 49.78 | 0.3 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 5.71 | 75.71 | -0.15 | | K Neighbors | 4.69 | 62.34 | -0.05 | | Random Forest | 5.19 | 68.62 | -0.12 | | Gradient Boosting | 4.76 | 67.24 | -0.28 | | XGBoost | 4.54 | 66.16 | 0.02 | Train
Test
df_02_PCE = df.drop(['mat0', 'mat1', 'hopt (%)']).drop_nulls()
print_missing_samples(df_02_PCE)
Shape: (112, 8) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% PCE (%): 0%
df_02_hopt = df.drop(['mat0', 'mat1', 'PCE (%)']).drop_nulls()
print_missing_samples(df_02_hopt)
Shape: (129, 8) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% hopt (%): 0%
# Get Output variables
PCE = df_02_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
# Get the Input variables
df_input = df_02_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()
## PCE
print(f'PCE')
models_02_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_qy')
PCE Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.48 | 4.21 | 0.13 | | K Neighbors | 0.12 | 0.15 | 0.97 | | Random Forest | 0.78 | 1.78 | 0.63 | | Gradient Boosting | 0.26 | 0.28 | 0.94 | | XGBoost | 1.01 | 2.76 | 0.42 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.64 | 4.99 | -0.17 | | K Neighbors | 1.11 | 3.8 | 0.17 | | Random Forest | 1.32 | 4.37 | -0.23 | | Gradient Boosting | 1.44 | 5.48 | -0.64 | | XGBoost | 1.43 | 5.41 | -0.32 | Train
Test
# Get Output variables
hopt = df_02_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]
# Get the Input variables
df_input = df_02_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()
## hopt
print(f'hopt')
models_02_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_qy')
hopt Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 5.25 | 62.42 | 0.18 | | K Neighbors | 2.38 | 23.96 | 0.68 | | Random Forest | 3.43 | 36.95 | 0.52 | | Gradient Boosting | 0.8 | 3.32 | 0.95 | | XGBoost | 3.37 | 37.93 | 0.51 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 5.86 | 73.91 | -0.39 | | K Neighbors | 4.56 | 50.99 | -0.01 | | Random Forest | 4.97 | 64.28 | -0.26 | | Gradient Boosting | 4.95 | 70.14 | -0.62 | | XGBoost | 4.21 | 63.4 | -0.04 | Train
Test
df_03_PCE = df.drop(['hopt (%)']).drop_nulls()
print_missing_samples(df_03_PCE)
Shape: (112, 10) mat0 : 0% mat1 : 0% abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% PCE (%): 0%
df_03_hopt = df.drop(['PCE (%)']).drop_nulls()
print_missing_samples(df_03_hopt)
Shape: (129, 10) mat0 : 0% mat1 : 0% abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% hopt (%): 0%
# Get Output variables
PCE = df_03_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
# Get the Input variables
df_input = df_03_PCE.drop(['mat0', 'mat1', 'PCE (%)'])
X = df_input.to_numpy()
temp_mat0 = df_03_PCE[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)
temp_mat1 = df_03_PCE[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)
X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)
## PCE
print(f'PCE')
models_03_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_qy_mat')
PCE Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.17 | 3.16 | 0.34 | | K Neighbors | 0.12 | 0.15 | 0.97 | | Random Forest | 0.79 | 1.82 | 0.62 | | Gradient Boosting | 0.27 | 0.28 | 0.94 | | XGBoost | 1.01 | 2.77 | 0.42 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.44 | 4.72 | -0.34 | | K Neighbors | 1.1 | 4.02 | 0.03 | | Random Forest | 1.3 | 4.35 | -0.2 | | Gradient Boosting | 1.29 | 5.35 | -0.53 | | XGBoost | 1.38 | 4.91 | -0.11 | Train
Test
# Get Output variables
hopt = df_03_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]
# Get the Input variables
df_input = df_03_hopt.drop(['mat0', 'mat1', 'hopt (%)'])
X = df_input.to_numpy()
temp_mat0 = df_03_hopt[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)
temp_mat1 = df_03_hopt[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)
X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)
## hopt
print(f'hopt')
models_03_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_qy_mat')
hopt Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 4.71 | 47.33 | 0.38 | | K Neighbors | 0.99 | 9.13 | 0.89 | | Random Forest | 2.27 | 17.14 | 0.78 | | Gradient Boosting | 0.49 | 1.48 | 0.98 | | XGBoost | 2.04 | 11.67 | 0.85 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 5.62 | 64.31 | -0.26 | | K Neighbors | 4.12 | 47.73 | 0.15 | | Random Forest | 4.06 | 43.72 | 0.19 | | Gradient Boosting | 4.52 | 60.79 | -0.07 | | XGBoost | 3.67 | 40.66 | 0.3 | Train
Test
def iqr_range(array):
Q1 = np.percentile(array, 25)
Q3 = np.percentile(array, 75)
IQR = Q3 - Q1
return ((Q1 - 1.5 * IQR),(Q3 + 1.5 * IQR))
PCE_array = df[['PCE (%)']].drop_nulls().to_numpy()[:,0]
#PCE_array
pce_low, pce_high = iqr_range(PCE_array)
print(f'PCE [{pce_low}, {pce_high}]')
PCE [-4.000000000000001, 6.912000000000001]
df_filtered_PCE = df.filter((pl.col('PCE (%)') > pce_low) & (pl.col('PCE (%)') < pce_high))
sns.boxplot(data=df_filtered_PCE.to_pandas(), x='PCE (%)')
plt.show()
print(f'PCE: {df_filtered_PCE.shape}')
PCE: (121, 11)
hopt_array = df[['hopt (%)']].drop_nulls().to_numpy()[:,0]
#hopt_array
hopt_low, hopt_high = iqr_range(hopt_array)
print(f'hopt [{hopt_low}, {hopt_high}]')
hopt [-6.99875, 15.63125]
df_filtered_hopt = df.filter((pl.col('hopt (%)') > pce_low) & (pl.col('hopt (%)') < pce_high))
sns.boxplot(data=df_filtered_hopt.to_pandas(), x='hopt (%)')
plt.show()
print(f'nhop: {df_filtered_hopt.shape}')
nhop: (106, 11)
#models_gbr = [('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42),
#{'n_estimators':[50, 100, 150],'min_samples_split':[2, 5, 10], 'min_samples_leaf':[1, 2, 4],'max_depth':max_depth})]
df_out_baseline_PCE = df_filtered_PCE.drop(['mat0', 'mat1', 'QY (%)', 'hopt (%)']).drop_nulls()
print_missing_samples(df_out_baseline_PCE)
Shape: (121, 7) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% PCE (%): 0%
# Get Output variables
PCE = df_out_baseline_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
# Get the Input variables
df_input = df_out_baseline_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()
## PCE
print(f'PCE')
models_out_pce_baseline = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_nout_baseline')
PCE Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.15 | 2.0 | 0.14 | | K Neighbors | 0.26 | 0.39 | 0.83 | | Random Forest | 0.45 | 0.46 | 0.8 | | Gradient Boosting | 0.17 | 0.15 | 0.93 | | XGBoost | 0.54 | 0.69 | 0.7 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.25 | 2.35 | -0.01 | | K Neighbors | 0.99 | 1.94 | 0.1 | | Random Forest | 0.91 | 1.62 | 0.25 | | Gradient Boosting | 0.79 | 1.53 | 0.28 | | XGBoost | 1.05 | 2.08 | 0.03 | Train
Test
df_out_baseline_hopt = df_filtered_hopt.drop(['mat0', 'mat1', 'QY (%)', 'PCE (%)']).drop_nulls()
print_missing_samples(df_out_baseline_hopt)
Shape: (106, 7) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% hopt (%): 0%
# Get Output variables
hopt = df_out_baseline_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]
# Get the Input variables
df_input = df_out_baseline_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()
## hopt
print(f'hopt')
models_out_hopt_baseline = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_nout_baseline')
hopt Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.43 | 3.11 | 0.04 | | K Neighbors | 0.1 | 0.09 | 0.97 | | Random Forest | 0.85 | 1.11 | 0.66 | | Gradient Boosting | 0.32 | 0.22 | 0.93 | | XGBoost | 1.07 | 1.76 | 0.46 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.59 | 3.86 | -0.18 | | K Neighbors | 1.36 | 2.88 | 0.09 | | Random Forest | 1.44 | 3.04 | 0.04 | | Gradient Boosting | 1.5 | 3.39 | -0.06 | | XGBoost | 1.45 | 3.18 | 0.01 | Train
Test
df_out_02_PCE = df_filtered_PCE.drop(['mat0', 'mat1', 'hopt (%)']).drop_nulls()
print_missing_samples(df_out_02_PCE)
Shape: (107, 8) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% PCE (%): 0%
df_out_02_hopt = df_filtered_hopt.drop(['mat0', 'mat1', 'PCE (%)']).drop_nulls()
print_missing_samples(df_out_02_hopt)
Shape: (95, 8) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% hopt (%): 0%
# Get Output variables
PCE = df_out_02_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
# Get the Input variables
df_input = df_out_02_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()
## PCE
print(f'PCE')
models_out_02_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_nout_qy')
PCE Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.11 | 1.91 | 0.12 | | K Neighbors | 0.12 | 0.14 | 0.93 | | Random Forest | 0.39 | 0.37 | 0.83 | | Gradient Boosting | 0.16 | 0.16 | 0.93 | | XGBoost | 0.56 | 0.71 | 0.67 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.27 | 2.42 | -0.15 | | K Neighbors | 0.99 | 1.97 | 0.11 | | Random Forest | 0.91 | 1.65 | 0.24 | | Gradient Boosting | 0.96 | 2.03 | 0.05 | | XGBoost | 0.97 | 1.85 | 0.15 | Train
Test
# Get Output variables
hopt = df_out_02_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]
# Get the Input variables
df_input = df_out_02_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()
## hopt
print(f'hopt')
models_out_02_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_nout_qy')
hopt Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.45 | 3.18 | 0.04 | | K Neighbors | 0.48 | 0.87 | 0.74 | | Random Forest | 0.66 | 0.74 | 0.78 | | Gradient Boosting | 0.21 | 0.13 | 0.96 | | XGBoost | 0.97 | 1.52 | 0.54 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.63 | 4.12 | -0.3 | | K Neighbors | 1.36 | 2.91 | 0.07 | | Random Forest | 1.4 | 2.92 | 0.07 | | Gradient Boosting | 1.47 | 3.55 | -0.14 | | XGBoost | 1.42 | 3.15 | 0.01 | Train
Test
df_out_03_PCE = df_filtered_PCE.drop(['hopt (%)']).drop_nulls()
print_missing_samples(df_out_03_PCE)
Shape: (107, 10) mat0 : 0% mat1 : 0% abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% PCE (%): 0%
df_out_03_hopt = df_filtered_hopt.drop(['PCE (%)']).drop_nulls()
print_missing_samples(df_out_03_hopt)
Shape: (95, 10) mat0 : 0% mat1 : 0% abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0% hopt (%): 0%
# Get Output variables
PCE = df_out_03_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
# Get the Input variables
df_input = df_out_03_PCE.drop(['mat0', 'mat1', 'PCE (%)'])
X = df_input.to_numpy()
temp_mat0 = df_out_03_PCE[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)
temp_mat1 = df_out_03_PCE[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)
X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)
## PCE
print(f'PCE')
models_out_03_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_nout_qy_mat')
PCE Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 0.79 | 1.15 | 0.47 | | K Neighbors | 0.17 | 0.21 | 0.9 | | Random Forest | 0.35 | 0.3 | 0.86 | | Gradient Boosting | 0.14 | 0.15 | 0.93 | | XGBoost | 0.4 | 0.45 | 0.79 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.02 | 1.8 | 0.16 | | K Neighbors | 0.88 | 1.72 | 0.22 | | Random Forest | 0.88 | 1.56 | 0.28 | | Gradient Boosting | 0.96 | 1.99 | 0.06 | | XGBoost | 1.0 | 2.16 | -0.02 | Train
Test
# Get Output variables
hopt = df_out_03_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]
# Get the Input variables
df_input = df_out_03_hopt.drop(['mat0', 'mat1', 'hopt (%)'])
X = df_input.to_numpy()
temp_mat0 = df_out_03_hopt[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)
temp_mat1 = df_out_03_hopt[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)
X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)
## PCE
print(f'hopt')
models_out_03_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_nout_qy_mat')
hopt Train | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.36 | 2.84 | 0.14 | | K Neighbors | 0.21 | 0.36 | 0.89 | | Random Forest | 0.63 | 0.64 | 0.81 | | Gradient Boosting | 0.18 | 0.11 | 0.97 | | XGBoost | 0.97 | 1.53 | 0.54 | Test | Model | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Linear Regression | 1.71 | 4.66 | -0.48 | | K Neighbors | 1.4 | 3.05 | 0.06 | | Random Forest | 1.41 | 3.02 | 0.04 | | Gradient Boosting | 1.49 | 3.6 | -0.12 | | XGBoost | 1.43 | 3.28 | -0.04 | Train
Test
df = pl.read_excel('db.xlsx')
#df = df.drop(['#'])
df_clustering_index = df.drop(['mat0', 'mat1', 'hopt (%)', 'PCE (%)'])
df_clustering_index = df_clustering_index.drop_nulls()
df_clustering = df_clustering_index.drop(['#'])
print_missing_samples(df_clustering)
Shape: (173, 7) abs_peak: 0% abs_min: 0% abs_max: 0% em_peak: 0% em_min: 0% em_max: 0% QY (%): 0%
import sklearn.metrics as sklearn_metrics
def inertia_score(X, labels):
"""This is the same of scikit-learn's KMeans.inertia_, but it works also in case of only one label."""
inertia = 0.0
for label in set(labels):
X_cluster = X[labels == label, :]
centroid = np.mean(X_cluster, axis=0)
inertia += np.sum((X_cluster - centroid) ** 2)
return inertia
def calinski_harabasz_score(X, labels):
"""Wrapper function of Scikit-learn's calinski_harabasz_score. The only difference is it doesn't throw an error where there is only one label."""
if len(set(labels)) == 1:
return float("NaN")
else:
return sklearn_metrics.calinski_harabasz_score(X, labels)
def davies_bouldin_score(X, labels):
"""Wrapper function of Scikit-learn's davies_bouldin_score. The only difference is it doesn't throw an error where there is only one label."""
if len(set(labels)) == 1:
return float("NaN")
else:
return sklearn_metrics.davies_bouldin_score(X, labels)
def silhouette_score(X, labels):
"""Wrapper function of Scikit-learn's silhouette_score. The only difference is it doesn't throw an error where there is only one label."""
if len(set(labels)) == 1:
return float("NaN")
else:
return sklearn_metrics.silhouette_score(X, labels)
def get_bic_aic(k, X):
gmm = GaussianMixture(n_components=k, init_params='kmeans')
gmm.fit(X)
return gmm.bic(X), gmm.aic(X)
# Cluster the materials without the hopt and PCE
X = df_clustering.to_numpy()
print(f'X = {X.shape}')
scores = {'elbow': [], 'calinski-harabasz': [], 'davies-bouldin': [], 'silhouette': [], 'bic': [], }
min_k = 2
max_k = 20
for k in range(min_k, max_k):
kmeans = KMeans(n_clusters=k, max_iter=1000, n_init='auto', init='k-means++', random_state=5).fit(X)
labels = kmeans.labels_
scores['elbow'].append(inertia_score(X, labels))
scores['calinski-harabasz'].append(calinski_harabasz_score(X, labels))
scores['davies-bouldin'].append(davies_bouldin_score(X, labels))
scores['silhouette'].append(silhouette_score(X, labels))
bic, _ = get_bic_aic(k, X)
scores['bic'].append(bic)
X = (173, 7)
x = range(min_k, max_k)
y = scores['elbow']
points = np.zeros((len(y),2))
points[:,0] = x
points[:,1] = y
methods = ['elbow', 'calinski-harabasz', 'davies-bouldin', 'silhouette', 'bic']
fig, axs = plt.subplots(1, 5)
for i in range(len(methods)):
method = methods[i]
axs[i].plot(range(min_k, max_k), scores[method])
if method == 'elbow':
idx = kneedle.auto_knee(points)
elif method == 'calinski-harabasz':
idx = scores[method].index(max(scores[method]))
#axs[i].plot(x[idx], scores[method][idx], 'ro')
elif method == 'davies-bouldin':
idx = scores[method].index(min(scores[method]))
#axs[i].plot(x[idx], scores[method][idx], 'ro')
elif method == 'silhouette':
idx = scores[method].index(max(scores[method]))
#axs[i].plot(x[idx], scores[method][idx], 'ro')
elif method == 'bic':
idx = scores[method].index(max(scores[method]))
#axs[i].plot(x[idx], scores[method][idx], 'ro')
axs[i].plot(x[idx], scores[method][idx], 'ro')
fig = plt.gcf()
fig.savefig(f'figures/kmeans.pdf', bbox_inches='tight')
plt.show()
def compute_avg_field_per_cluster(df_origin, df_clustering, labels, field='PCE (%)'):
field_per_cluster = []
min_k = min(labels)
max_k = max(labels)
#print(f'[{min_k}, {max_k}]')
for k in range(min_k, max_k+1):
mask = (labels == k)
#print(f'k={k} = {mask} {mask.shape} {df.shape}')
df_k = df_clustering.filter(mask)
#print(f'{df_k}')
cluster_labels = df_k['#'].to_list()
#print(f'{cluster_labels}')
filter_df = df_origin.filter(pl.col('#').is_in(cluster_labels))
#print(filter_df)
mean_field = filter_df[field].mean()
field_per_cluster.append(mean_field)
return field_per_cluster
def plot_stats_per_cluster(df_origin, df_clustering, labels):
min_k = min(labels)
max_k = max(labels)
#print(f'[{min_k}, {max_k}]')
for k in range(min_k, max_k+1):
print(f'Cluster: {k}')
mask = (labels == k)
#print(f'k={k} = {mask} {mask.shape} {df.shape}')
df_k = df_clustering.filter(mask)
#print(f'{df_k}')
cluster_labels = df_k['#'].to_list()
#print(f'{cluster_labels}')
filter_df = df_origin.filter(pl.col('#').is_in(cluster_labels))
describe_variables(filter_df.drop(['#']), filename=f'cluster_{k}')
def find_nearest(array, value):
diff = np.sum(np.abs(array - value), axis=1)
idx = diff.argmin()
return idx
def find_nearest_df(df, df_origin, value, k, field):
df_no_id = df.drop(['#'])
array = df_no_id.to_numpy()
diff = np.sum(np.abs(array - value), axis=1)
idxs = []
i = j = 0
while i < k and j < len(diff):
j += 1
idx = diff.argmin()
#print(f'Current idx {idx}')
diff[idx] = float('inf')
#print(f'{df_origin.row(idx, named=True)}')
if df_origin.row(idx, named=True)[field] is not None:
idxs.append(idx)
i += 1
idxs = [df.row(i, named=True)['#'] for i in idxs]
return idxs
def get_missing(df, field):
df_qy_not_null = df.filter(pl.col('QY (%)').is_not_null())
missing_df = df_qy_not_null.filter(pl.col(field).is_null())
missing = missing_df.drop(['#', 'mat0', 'mat1', 'hopt (%)', 'PCE (%)']).to_numpy()
return missing
#return missing_df
#vector_df = missing_df.drop(['#', 'mat0', 'mat1', 'hopt (%)', 'PCE (%)'])
#missing = vector_df.to_numpy()
def predict_field_clustering(df_origin, df_clustering, kmeans, k=3, field='PCE (%)'):
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
#centroid_labels = [centroids[i] for i in labels]
#print(f'{labels} {centroids}')
missing = get_missing(df_origin, field)
#missing = missing_df.drop(['#', 'mat0', 'mat1', 'hopt (%)', 'PCE (%)']).to_numpy()
# for each missing point, find the closest cluster
rv = []
for row in missing:
closest_cluster = find_nearest(centroids, row)
# select the corresponding cluster
mask = (labels == closest_cluster)
df_k = df_clustering.filter(mask)
temp_id = df_k['#'].to_list()
df_origin_k = df_origin.filter(pl.col('#').is_in(temp_id))
idxs = find_nearest_df(df_k, df_origin_k, row, k, field)
filter_df = df_origin.filter(pl.col('#').is_in(idxs))
# compute the average
pce_k = filter_df[field].to_numpy()
rv.append((np.mean(pce_k), np.median(pce_k)))
return rv
def get_clusters_df(df_origin, df_clustering, labels):
min_k = min(labels)
max_k = max(labels)
df_per_cluster = []
for k in range(min_k, max_k+1):
mask = (labels == k)
#print(f'k={k} = {mask} {mask.shape} {df.shape}')
df_k = df_clustering.filter(mask)
#print(f'{df_k}')
cluster_labels = df_k['#'].to_list()
#print(f'{cluster_labels}')
filter_df = df_origin.filter(pl.col('#').is_in(cluster_labels))
#print(filter_df)
#mean_field = filter_df[field].mean()
#field_per_cluster.append(mean_field)
df_per_cluster.append(filter_df)
return df_per_cluster
def rmse(measured, truth):
return np.linalg.norm(measured - truth) / np.sqrt(len(truth))
k=5
kmeans = KMeans(n_clusters=k, max_iter=1000, n_init='auto', init='k-means++', random_state=5).fit(X)
labels = kmeans.labels_
avg_pce_cluster = compute_avg_field_per_cluster(df, df_clustering_index, labels, field='PCE (%)')
print(f'Avg PCE cluster: {avg_pce_cluster}')
avg_hopt_cluster = compute_avg_field_per_cluster(df, df_clustering_index, labels, field='hopt (%)')
print(f'Avg hOPT cluster: {avg_hopt_cluster}')
predictions_pce = predict_field_clustering(df, df_clustering_index, kmeans, k=3, field='PCE (%)')
print(f'Predictions PCE: {predictions_pce}')
predictions_hopt = predict_field_clustering(df, df_clustering_index, kmeans, k=3, field='hopt (%)')
print(f'Predictions hOPT: {predictions_hopt}')
Avg PCE cluster: [1.7782352941176471, 2.208725, 0.9611066247880435, 2.3942907096774193, 0.5521064117647059] Avg hOPT cluster: [3.122, 9.250588235294117, 7.5371304347826085, 4.159564102564102, 5.469478260869566] Predictions PCE: [(2.466666666666667, 2.9), (0.8433333333333334, 0.2), (1.9866666666666666, 1.96), (2.546666666666667, 2.9), (1.4166666666666667, 0.61), (0.7966666666666667, 0.61), (1.9217333333333333, 1.96), (0.07253333333333332, 0.0052), (1.96, 1.7), (5.4433333333333325, 4.29), (0.86, 0.2), (3.7266666666666666, 1.7), (2.63, 3.05), (2.9633333333333334, 2.94), (3.296666666666667, 3.1), (0.47333333333333333, 0.44), (0.47333333333333333, 0.44), (1.78, 1.7), (2.63, 3.05), (2.796666666666667, 2.39), (2.0033333333333334, 2.39), (1.0923333333333334, 0.77), (2.0366666666666666, 2.6), (0.11266666666666668, 0.117), (0.42333333333333334, 0.38), (2.42, 2.39), (1.7199666666666669, 2.3), (2.4, 1.96), (2.1672666666666665, 2.7), (1.7199666666666669, 2.3), (2.4, 1.96), (2.58, 1.94), (0.0011200000000000001, 0.00086), (0.015533333333333335, 0.0019), (2.2699999999999996, 1.94), (0.09355633333333334, 0.000471), (0.07253333333333332, 0.0052), (0.10832000000000001, 0.0441), (0.7966666666666667, 0.61), (0.09362, 0.08), (0.07253333333333332, 0.0052), (0.7966666666666667, 0.61), (1.21, 0.62), (0.8099666666666666, 0.08), (1.3350666666666668, 1.2), (1.23, 0.62), (0.8692000000000001, 0.0052), (1.8933333333333335, 2.6), (1.6499999999999997, 1.44), (1.5099999999999998, 1.13), (1.39, 1.7), (4.5566666666666675, 2.16), (2.466666666666667, 2.9), (1.1633333333333333, 0.62), (1.7666666666666666, 1.7), (1.623333333333333, 1.8), (0.779, 0.08), (1.3666666666666665, 1.7), (1.0566666666666666, 0.77), (2.23, 1.7), (4.140000000000001, 4.36)] Predictions hOPT: [(4.0, 3.9), (2.56, 2.58), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (11.866666666666667, 12.5), (4.566666666666666, 6.4), (4.366666666666667, 3.2), (34.669999999999995, 48.0), (4.566666666666666, 6.4), (5.433333333333333, 6.8), (7.099999999999999, 6.8), (4.653333333333333, 4.56), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (2.3066666666666666, 1.75), (2.3066666666666666, 1.75), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (3.233333333333333, 1.5), (3.7999999999999994, 3.3), (1.7266666666666666, 2.1), (1.53, 0.28), (1.553333333333333, 0.34), (3.86, 2.3), (1.4000000000000001, 1.2), (3.393333333333333, 2.1), (13.633333333333335, 2.7), (19.066666666666666, 19.0), (25.166666666666668, 19.0), (13.633333333333335, 2.7), (17.400000000000002, 3.2), (1.7166666666666668, 1.85), (3.2266666666666666, 1.6), (11.523333333333333, 4.56), (0.5976666666666667, 0.058), (5.8, 4.8), (2.8200000000000003, 3.51), (3.98, 3.2), (2.1833333333333336, 1.6), (3.5533333333333332, 3.2), (3.5533333333333332, 3.2), (3.61, 3.51)]
mean_k_preds_pce = [p[0] for p in predictions_pce]
median_k_preds_pce = [p[1] for p in predictions_pce]
mean_k_preds_hopt = [p[0] for p in predictions_hopt]
median_k_preds_hopt = [p[1] for p in predictions_hopt]
print(f'PCE')
missing = get_missing(df, field='PCE (%)')
for name, _, _ in models:
print(f'Model: {name}')
preds_pce = models_02_pce[name].predict(missing)
out_preds_pce = models_out_02_pce[name].predict(missing)
print(f'Mean')
diff_mean_preds = np.abs(preds_pce-mean_k_preds_pce)
diff_mean_out_preds = np.abs(out_preds_pce-mean_k_preds_pce)
ax = sns.violinplot(data=[diff_mean_preds, diff_mean_out_preds])
ax.set_xticklabels(['Baseline', 'No Outliers'])
fig = plt.gcf()
fig.savefig(f'figures/cluster_pce_{name}_mean.pdf', bbox_inches='tight')
plt.show()
#print(f'RMSE {rmse(preds_pce, mean_k_preds_pce)} {rmse(out_preds_pce, mean_k_preds_pce)}')
print(f'Median')
diff_median_preds = np.abs(preds_pce-median_k_preds_pce)
diff_median_out_preds = np.abs(out_preds_pce-median_k_preds_pce)
ax = sns.violinplot(data=[diff_median_preds, diff_median_out_preds])
ax.set_xticklabels(['Baseline', 'No Outliers'])
fig = plt.gcf()
fig.savefig(f'figures/cluster_pce_{name}_median.pdf', bbox_inches='tight')
plt.show()
#print(f'RMSE {rmse(preds_pce, median_k_preds_pce)} {rmse(out_preds_pce, median_k_preds_pce)}')
score_mae_mean = mean_absolute_error(preds_pce, mean_k_preds_pce)
score_mse_mean = mean_squared_error(preds_pce, mean_k_preds_pce)
score_r2_mean = r2_score(preds_pce, mean_k_preds_pce)
score_mae_median = mean_absolute_error(preds_pce, median_k_preds_pce)
score_mse_median = mean_squared_error(preds_pce, median_k_preds_pce)
score_r2_median = r2_score(preds_pce, median_k_preds_pce)
# print the results
print(f'Baseline')
print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
print(f'| ----------------- | ------ | ------ | ------ |')
print(f'| {"Mean":<17} | {round(score_mae_mean, 2):>6} | {round(score_mse_mean, 2):>6} | {round(score_r2_mean, 2):>6} |')
print(f'| {"Median":<17} | {round(score_mae_median, 2):>6} | {round(score_mse_median, 2):>6} | {round(score_r2_median, 2):>6} |')
print()
score_mae_mean_out = mean_absolute_error(out_preds_pce, mean_k_preds_pce)
score_mse_mean_out = mean_squared_error(out_preds_pce, mean_k_preds_pce)
score_r2_mean_out = r2_score(mean_k_preds_pce, out_preds_pce)
score_mae_median_out = mean_absolute_error(out_preds_pce, median_k_preds_pce)
score_mse_median_out = mean_squared_error(out_preds_pce, median_k_preds_pce)
score_r2_median_out = r2_score(median_k_preds_pce, out_preds_pce)
# print the results
print(f'No Outliers')
print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
print(f'| ----------------- | ------ | ------ | ------ |')
print(f'| {"Mean":<17} | {round(score_mae_mean_out, 2):>6} | {round(score_mse_mean_out, 2):>6} | {round(score_r2_mean_out, 2):>6} |')
print(f'| {"Median":<17} | {round(score_mae_median_out, 2):>6} | {round(score_mse_median_out, 2):>6} | {round(score_r2_median_out, 2):>6} |')
print()
PCE Model: Linear Regression Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.97 | 1.47 | -1.87 | | Median | 1.02 | 1.48 | -1.89 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.84 | 1.29 | 0.03 | | Median | 0.89 | 1.19 | 0.09 | Model: K Neighbors Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.8 | 1.35 | -0.2 | | Median | 0.84 | 1.36 | -0.21 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.68 | 0.99 | 0.26 | | Median | 0.74 | 0.81 | 0.38 | Model: Random Forest Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.86 | 1.13 | -1.23 | | Median | 0.91 | 1.24 | -1.45 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.79 | 1.24 | 0.07 | | Median | 0.87 | 1.1 | 0.16 | Model: Gradient Boosting Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 1.18 | 2.86 | -0.13 | | Median | 1.21 | 2.78 | -0.1 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.99 | 1.8 | -0.35 | | Median | 1.03 | 1.58 | -0.2 | Model: XGBoost Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.83 | 1.07 | -3.07 | | Median | 0.9 | 1.04 | -2.97 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 0.86 | 1.46 | -0.1 | | Median | 0.89 | 1.23 | 0.07 |
print(f'hOPT')
missing = get_missing(df, field='hopt (%)')
for name, _, _ in models:
print(f'Model: {name}')
preds_hopt = models_02_hopt[name].predict(missing)
out_preds_hopt = models_out_02_hopt[name].predict(missing)
print(f'Mean')
diff_mean_preds = np.abs(preds_hopt-mean_k_preds_hopt)
diff_mean_out_preds = np.abs(out_preds_hopt-mean_k_preds_hopt)
ax = sns.violinplot(data=[diff_mean_preds, diff_mean_out_preds])
ax.set_xticklabels(['Baseline', 'No Outliers'])
fig = plt.gcf()
fig.savefig(f'figures/cluster_hopt_{name}_mean.pdf', bbox_inches='tight')
plt.show()
#print(f'RMSE {rmse(preds_pce, mean_k_preds_pce)} {rmse(out_preds_pce, mean_k_preds_pce)}')
print(f'Median')
diff_median_preds = np.abs(preds_hopt-median_k_preds_hopt)
diff_median_out_preds = np.abs(out_preds_hopt-median_k_preds_hopt)
ax = sns.violinplot(data=[diff_median_preds, diff_median_out_preds])
ax.set_xticklabels(['Baseline', 'No Outliers'])
fig = plt.gcf()
fig.savefig(f'figures/cluster_hopt_{name}_median.pdf', bbox_inches='tight')
plt.show()
#print(f'RMSE {rmse(preds_pce, median_k_preds_pce)} {rmse(out_preds_pce, median_k_preds_pce)}')
score_mae_mean = mean_absolute_error(preds_hopt, mean_k_preds_hopt)
score_mse_mean = mean_squared_error(preds_hopt, mean_k_preds_hopt)
score_r2_mean = r2_score(preds_hopt, mean_k_preds_hopt)
score_mae_median = mean_absolute_error(preds_hopt, median_k_preds_hopt)
score_mse_median = mean_squared_error(preds_hopt, median_k_preds_hopt)
score_r2_median = r2_score(preds_hopt, median_k_preds_hopt)
# print the results
print(f'Baseline')
print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
print(f'| ----------------- | ------ | ------ | ------ |')
print(f'| {"Mean":<17} | {round(score_mae_mean, 2):>6} | {round(score_mse_mean, 2):>6} | {round(score_r2_mean, 2):>6} |')
print(f'| {"Median":<17} | {round(score_mae_median, 2):>6} | {round(score_mse_median, 2):>6} | {round(score_r2_median, 2):>6} |')
print()
score_mae_mean_out = mean_absolute_error(out_preds_hopt, mean_k_preds_hopt)
score_mse_mean_out = mean_squared_error(out_preds_hopt, mean_k_preds_hopt)
score_r2_mean_out = r2_score(mean_k_preds_hopt, out_preds_hopt)
score_mae_median_out = mean_absolute_error(out_preds_hopt, median_k_preds_hopt)
score_mse_median_out = mean_squared_error(out_preds_hopt, median_k_preds_hopt)
score_r2_median_out = r2_score(median_k_preds_hopt, out_preds_hopt)
# print the results
print(f'No Outliers')
print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
print(f'| ----------------- | ------ | ------ | ------ |')
print(f'| {"Mean":<17} | {round(score_mae_mean_out, 2):>6} | {round(score_mse_mean_out, 2):>6} | {round(score_r2_mean_out, 2):>6} |')
print(f'| {"Median":<17} | {round(score_mae_median_out, 2):>6} | {round(score_mse_median_out, 2):>6} | {round(score_r2_median_out, 2):>6} |')
print()
hOPT Model: Linear Regression Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 4.99 | 47.15 | -2.12 | | Median | 4.79 | 57.43 | -2.8 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.84 | 56.54 | -0.18 | | Median | 3.07 | 63.06 | -0.06 | Model: K Neighbors Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 2.65 | 23.39 | 0.49 | | Median | 1.98 | 15.15 | 0.67 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.67 | 57.3 | -0.19 | | Median | 2.77 | 62.26 | -0.04 | Model: Random Forest Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.52 | 39.02 | 0.46 | | Median | 2.41 | 21.55 | 0.7 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.59 | 58.64 | -0.22 | | Median | 2.87 | 64.29 | -0.08 | Model: Gradient Boosting Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.42 | 36.85 | 0.43 | | Median | 2.09 | 18.12 | 0.72 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.92 | 61.96 | -0.29 | | Median | 3.2 | 67.39 | -0.13 | Model: XGBoost Mean
Median
Baseline | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 2.95 | 28.03 | -0.29 | | Median | 2.31 | 19.48 | 0.1 | No Outliers | Aggregation | MAE | MSE | R2 | | ----------------- | ------ | ------ | ------ | | Mean | 3.79 | 59.47 | -0.24 | | Median | 2.93 | 64.6 | -0.08 |
df_per_cluster = get_clusters_df(df, df_clustering_index, labels)
for i, df in enumerate(df_per_cluster):
print(f'Cluster {i}')
print(f'{df}')
Cluster 0 shape: (25, 12) ┌─────┬──────┬───────────┬──────────┬───┬────────┬────────┬──────────┬─────────┐ │ # ┆ mat0 ┆ mat1 ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ str ┆ i64 ┆ ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════╪══════╪═══════════╪══════════╪═══╪════════╪════════╪══════════╪═════════╡ │ 8 ┆ QD ┆ solution ┆ 800 ┆ … ┆ 1000 ┆ 30.0 ┆ 1.4 ┆ 3.2 │ │ 14 ┆ dye ┆ solution ┆ 466 ┆ … ┆ 900 ┆ 11.0 ┆ 0.6 ┆ null │ │ 15 ┆ dye ┆ solution ┆ 557 ┆ … ┆ 900 ┆ 9.0 ┆ 0.5 ┆ null │ │ 26 ┆ dye ┆ film ┆ 745 ┆ … ┆ 900 ┆ 25.0 ┆ null ┆ 0.61 │ │ 27 ┆ dye ┆ film ┆ 745 ┆ … ┆ 900 ┆ 25.0 ┆ null ┆ 1.24 │ │ 28 ┆ dye ┆ film ┆ 745 ┆ … ┆ 900 ┆ 25.0 ┆ null ┆ 0.54 │ │ 29 ┆ dye ┆ film ┆ 745 ┆ … ┆ 900 ┆ 25.0 ┆ null ┆ 1.41 │ │ 34 ┆ NP ┆ bulk ┆ 550 ┆ … ┆ 1000 ┆ 80.0 ┆ 6.8 ┆ null │ │ 36 ┆ QD ┆ film ┆ 450 ┆ … ┆ 975 ┆ 40.0 ┆ null ┆ null │ │ 37 ┆ QD ┆ waveguide ┆ 650 ┆ … ┆ 850 ┆ 63.0 ┆ 1.75 ┆ null │ │ 38 ┆ QD ┆ solution ┆ 650 ┆ … ┆ 850 ┆ 63.0 ┆ 3.67 ┆ null │ │ 55 ┆ QD ┆ bulk ┆ 500 ┆ … ┆ 1000 ┆ 50.0 ┆ null ┆ 2.85 │ │ 57 ┆ QD ┆ film ┆ 450 ┆ … ┆ 1100 ┆ 91.0 ┆ 8.1 ┆ 2.94 │ │ 61 ┆ QD ┆ bulk ┆ 500 ┆ … ┆ 1000 ┆ 78.0 ┆ 6.4 ┆ 3.1 │ │ 62 ┆ QD ┆ bulk ┆ 640 ┆ … ┆ 1200 ┆ 40.0 ┆ null ┆ 3.27 │ │ 63 ┆ QD ┆ bulk ┆ 415 ┆ … ┆ 1240 ┆ 60.3 ┆ null ┆ 3.94 │ │ 72 ┆ dye ┆ film ┆ 760 ┆ … ┆ 900 ┆ 24.0 ┆ null ┆ 0.44 │ │ 73 ┆ dye ┆ film ┆ 760 ┆ … ┆ 900 ┆ 24.0 ┆ null ┆ 0.28 │ │ 74 ┆ dye ┆ film ┆ 700 ┆ … ┆ 900 ┆ 30.0 ┆ null ┆ 0.62 │ │ 75 ┆ dye ┆ film ┆ 700 ┆ … ┆ 900 ┆ 30.0 ┆ null ┆ 0.36 │ │ 76 ┆ dye ┆ film ┆ 738 ┆ … ┆ 900 ┆ 23.0 ┆ null ┆ 0.41 │ │ 77 ┆ dye ┆ film ┆ 738 ┆ … ┆ 900 ┆ 23.0 ┆ null ┆ 0.28 │ │ 79 ┆ QD ┆ bulk ┆ 763 ┆ … ┆ 1050 ┆ 70.0 ┆ null ┆ 4.74 │ │ 106 ┆ dye ┆ film ┆ 770 ┆ … ┆ 950 ┆ 16.0 ┆ 1.5 ┆ null │ │ 110 ┆ dye ┆ fiber ┆ 780 ┆ … ┆ 850 ┆ 21.0 ┆ 0.5 ┆ null │ └─────┴──────┴───────────┴──────────┴───┴────────┴────────┴──────────┴─────────┘ Cluster 1 shape: (46, 12) ┌─────┬─────────┬──────────┬──────────┬───┬────────┬────────┬──────────┬─────────┐ │ # ┆ mat0 ┆ mat1 ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ str ┆ i64 ┆ ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════╪═════════╪══════════╪══════════╪═══╪════════╪════════╪══════════╪═════════╡ │ 1 ┆ dye ┆ film ┆ 578 ┆ … ┆ 750 ┆ 98.0 ┆ 18.8 ┆ null │ │ 9 ┆ QD ┆ solution ┆ 600 ┆ … ┆ 700 ┆ 50.0 ┆ 0.5 ┆ 1.2 │ │ 11 ┆ QD ┆ solution ┆ 376 ┆ … ┆ 700 ┆ 50.0 ┆ 0.3 ┆ null │ │ 12 ┆ dye ┆ solution ┆ 413 ┆ … ┆ 800 ┆ 67.0 ┆ 3.4 ┆ null │ │ 13 ┆ dye ┆ solution ┆ 550 ┆ … ┆ 750 ┆ 95.0 ┆ 2.6 ┆ null │ │ 16 ┆ polymer ┆ solution ┆ 460 ┆ … ┆ 800 ┆ 45.0 ┆ 1.0 ┆ null │ │ 17 ┆ polymer ┆ solution ┆ 467 ┆ … ┆ 700 ┆ 48.0 ┆ 0.9 ┆ null │ │ 25 ┆ dye ┆ bulk ┆ 580 ┆ … ┆ 800 ┆ 100.0 ┆ null ┆ 0.0018 │ │ 30 ┆ dye ┆ bulk ┆ 525 ┆ … ┆ 800 ┆ 97.6 ┆ null ┆ 2.6 │ │ 51 ┆ NP ┆ film ┆ 500 ┆ … ┆ 600 ┆ 92.0 ┆ null ┆ null │ │ 56 ┆ QD ┆ bulk ┆ 500 ┆ … ┆ 700 ┆ 35.9 ┆ 1.45 ┆ null │ │ 68 ┆ dye ┆ bulk ┆ 478 ┆ … ┆ 800 ┆ 61.1 ┆ 22.0 ┆ null │ │ 69 ┆ dye ┆ bulk ┆ 513 ┆ … ┆ 800 ┆ 24.8 ┆ 3.3 ┆ null │ │ 71 ┆ dye ┆ bulk ┆ 473 ┆ … ┆ 800 ┆ 44.3 ┆ 24.7 ┆ null │ │ 84 ┆ Ln ┆ bulk ┆ 590 ┆ … ┆ 700 ┆ 65.0 ┆ null ┆ 11.3 │ │ 98 ┆ dye ┆ film ┆ 520 ┆ … ┆ 700 ┆ 78.0 ┆ 0.08 ┆ null │ │ 99 ┆ dye ┆ fiber ┆ 520 ┆ … ┆ 700 ┆ 78.0 ┆ 1.6 ┆ 0.0052 │ │ 100 ┆ dye ┆ fiber ┆ 520 ┆ … ┆ 700 ┆ 93.0 ┆ 8.0 ┆ 0.0024 │ │ 105 ┆ dye ┆ bulk ┆ 570 ┆ … ┆ 750 ┆ 80.0 ┆ 14.5 ┆ 2.16 │ │ 107 ┆ dye ┆ bulk ┆ 665 ┆ … ┆ 750 ┆ 12.0 ┆ 3.7 ┆ 0.1 │ │ 109 ┆ dye ┆ fiber ┆ 560 ┆ … ┆ 700 ┆ 95.0 ┆ 2.1 ┆ null │ │ 112 ┆ dye ┆ fiber ┆ 560 ┆ … ┆ 700 ┆ 95.0 ┆ null ┆ 0.21 │ │ 113 ┆ QD ┆ bulk ┆ 580 ┆ … ┆ 700 ┆ 10.0 ┆ null ┆ 2.1 │ │ 114 ┆ QD ┆ bulk ┆ 580 ┆ … ┆ 700 ┆ 60.0 ┆ null ┆ 2.1 │ │ 115 ┆ dye ┆ bulk ┆ 575 ┆ … ┆ 720 ┆ 95.0 ┆ null ┆ 3.3 │ │ 116 ┆ QD ┆ bulk ┆ 580 ┆ … ┆ 700 ┆ 10.0 ┆ null ┆ 2.1 │ │ 117 ┆ QD ┆ bulk ┆ 450 ┆ … ┆ 680 ┆ 44.0 ┆ null ┆ 2.8 │ │ 118 ┆ QD ┆ bulk ┆ 450 ┆ … ┆ 650 ┆ 86.0 ┆ 48.0 ┆ null │ │ 121 ┆ Ln ┆ bulk ┆ 530 ┆ … ┆ 750 ┆ 83.0 ┆ null ┆ 1.44 │ │ 122 ┆ QD ┆ bulk ┆ 473 ┆ … ┆ 700 ┆ 45.0 ┆ 1.0 ┆ null │ │ 124 ┆ QD ┆ bulk ┆ 470 ┆ … ┆ 700 ┆ 20.0 ┆ 2.01 ┆ null │ │ 125 ┆ dye ┆ bulk ┆ 521 ┆ … ┆ 700 ┆ 93.0 ┆ 54.0 ┆ null │ │ 128 ┆ dye ┆ film ┆ 545 ┆ … ┆ 800 ┆ 50.0 ┆ 12.5 ┆ null │ │ 129 ┆ dye ┆ solution ┆ 498 ┆ … ┆ 700 ┆ 30.0 ┆ 6.88 ┆ 0.27 │ │ 130 ┆ dye ┆ solution ┆ 569 ┆ … ┆ 750 ┆ 61.0 ┆ 2.58 ┆ null │ │ 134 ┆ CD ┆ bulk ┆ 720 ┆ … ┆ 600 ┆ 65.0 ┆ null ┆ 8.75 │ │ 136 ┆ CD ┆ film ┆ 557 ┆ … ┆ 700 ┆ 70.0 ┆ 2.3 ┆ null │ │ 138 ┆ dye ┆ bulk ┆ 575 ┆ … ┆ 700 ┆ 96.0 ┆ 37.7 ┆ null │ │ 142 ┆ dye ┆ bulk ┆ 575 ┆ … ┆ 720 ┆ 95.0 ┆ 19.0 ┆ 2.9 │ │ 143 ┆ QD ┆ bulk ┆ 480 ┆ … ┆ 700 ┆ 15.7 ┆ 3.2 ┆ 0.62 │ │ 146 ┆ QDdye ┆ bulk ┆ 500 ┆ … ┆ 750 ┆ 32.7 ┆ 1.0 ┆ null │ │ 164 ┆ CD ┆ film ┆ 555 ┆ … ┆ 800 ┆ 17.6 ┆ 3.0 ┆ 2.7 │ │ 165 ┆ CD ┆ film ┆ 450 ┆ … ┆ 800 ┆ 60.0 ┆ 4.3 ┆ 3.8 │ │ 171 ┆ CD ┆ film ┆ 410 ┆ … ┆ 750 ┆ 7.6 ┆ 2.77 ┆ 1.96 │ │ 173 ┆ QD ┆ bulk ┆ 600 ┆ … ┆ 700 ┆ 30.0 ┆ 2.7 ┆ 0.38 │ │ 200 ┆ dye ┆ solution ┆ 660 ┆ … ┆ 800 ┆ 31.0 ┆ 2.65 ┆ 0.21 │ └─────┴─────────┴──────────┴──────────┴───┴────────┴────────┴──────────┴─────────┘ Cluster 2 shape: (23, 12) ┌─────┬─────────┬──────────┬──────────┬───┬────────┬────────┬──────────┬──────────┐ │ # ┆ mat0 ┆ mat1 ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ str ┆ i64 ┆ ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════╪═════════╪══════════╪══════════╪═══╪════════╪════════╪══════════╪══════════╡ │ 177 ┆ Ln ┆ film ┆ 450 ┆ … ┆ 570 ┆ 44.0 ┆ 4.8 ┆ 0.054 │ │ 178 ┆ Ln ┆ film ┆ 400 ┆ … ┆ 470 ┆ 37.0 ┆ 7.7 ┆ 0.058 │ │ 179 ┆ CD ┆ film ┆ 425 ┆ … ┆ 400 ┆ 11.0 ┆ 13.1 ┆ 0.053 │ │ 180 ┆ Ln ┆ film ┆ 450 ┆ … ┆ 570 ┆ 44.0 ┆ 5.2 ┆ 0.046 │ │ 181 ┆ Ln ┆ film ┆ 400 ┆ … ┆ 470 ┆ 37.0 ┆ 7.7 ┆ 0.047 │ │ 182 ┆ CD ┆ film ┆ 425 ┆ … ┆ 400 ┆ 11.0 ┆ 12.8 ┆ 0.041 │ │ 183 ┆ Ln ┆ film ┆ 450 ┆ … ┆ 570 ┆ 59.0 ┆ 6.7 ┆ 0.074 │ │ 184 ┆ Ln ┆ film ┆ 400 ┆ … ┆ 470 ┆ 54.0 ┆ 8.5 ┆ 0.065 │ │ 185 ┆ Ln ┆ film ┆ 450 ┆ … ┆ 570 ┆ 59.0 ┆ 10.7 ┆ 0.096 │ │ 186 ┆ Ln ┆ film ┆ 400 ┆ … ┆ 470 ┆ 54.0 ┆ 8.7 ┆ 0.053 │ │ 187 ┆ Ln ┆ film ┆ 450 ┆ … ┆ 570 ┆ 59.0 ┆ 11.7 ┆ 0.142 │ │ 188 ┆ Ln ┆ film ┆ 400 ┆ … ┆ 470 ┆ 54.0 ┆ 16.5 ┆ 0.136 │ │ 189 ┆ dye ┆ film ┆ 650 ┆ … ┆ 600 ┆ 4.0 ┆ 2.6 ┆ 0.044 │ │ 190 ┆ polymer ┆ film ┆ 500 ┆ … ┆ 400 ┆ 47.0 ┆ 5.71 ┆ 2.29 │ │ 191 ┆ polymer ┆ film ┆ 500 ┆ … ┆ 450 ┆ 57.0 ┆ 9.112 ┆ 2.32 │ │ 192 ┆ polymer ┆ film ┆ 500 ┆ … ┆ 400 ┆ 68.0 ┆ 12.08 ┆ 2.47 │ │ 193 ┆ polymer ┆ film ┆ 500 ┆ … ┆ 400 ┆ 47.0 ┆ 5.71 ┆ 4.38 │ │ 194 ┆ polymer ┆ film ┆ 500 ┆ … ┆ 450 ┆ 57.0 ┆ 9.112 ┆ 4.62 │ │ 195 ┆ polymer ┆ film ┆ 500 ┆ … ┆ 400 ┆ 68.0 ┆ 12.08 ┆ 4.92 │ │ 196 ┆ NP ┆ bulk ┆ 600 ┆ … ┆ 490 ┆ 65.0 ┆ 0.15 ┆ 0.049413 │ │ 197 ┆ dye ┆ bulk ┆ 600 ┆ … ┆ 575 ┆ 70.0 ┆ 0.16 ┆ 0.050786 │ │ 198 ┆ NPdye ┆ bulk ┆ 600 ┆ … ┆ 500 ┆ 68.0 ┆ 0.22 ┆ 0.07531 │ │ 199 ┆ QD ┆ solution ┆ 510 ┆ … ┆ 475 ┆ 89.0 ┆ 2.32 ┆ 0.020944 │ └─────┴─────────┴──────────┴──────────┴───┴────────┴────────┴──────────┴──────────┘ Cluster 3 shape: (51, 12) ┌─────┬──────┬──────────┬──────────┬───┬────────┬────────┬──────────┬──────────┐ │ # ┆ mat0 ┆ mat1 ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ str ┆ i64 ┆ ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════╪══════╪══════════╪══════════╪═══╪════════╪════════╪══════════╪══════════╡ │ 18 ┆ dye ┆ film ┆ 403 ┆ … ┆ 600 ┆ 40.8 ┆ 7.7 ┆ null │ │ 19 ┆ dye ┆ bulk ┆ 374 ┆ … ┆ 600 ┆ 100.0 ┆ null ┆ null │ │ 23 ┆ dye ┆ bulk ┆ 340 ┆ … ┆ 700 ┆ 14.0 ┆ 0.25 ┆ null │ │ 24 ┆ dye ┆ bulk ┆ 340 ┆ … ┆ 600 ┆ 78.0 ┆ 0.4 ┆ null │ │ 43 ┆ CD ┆ film ┆ 350 ┆ … ┆ 650 ┆ 45.0 ┆ 12.0 ┆ null │ │ 44 ┆ CD ┆ film ┆ 358 ┆ … ┆ 600 ┆ 94.0 ┆ 3.9 ┆ null │ │ 46 ┆ CD ┆ bulk ┆ 340 ┆ … ┆ 700 ┆ 40.0 ┆ 0.92 ┆ null │ │ 47 ┆ CDQD ┆ tandem ┆ 450 ┆ … ┆ 700 ┆ 45.0 ┆ 1.4 ┆ null │ │ 49 ┆ CDQD ┆ bulk ┆ 400 ┆ … ┆ 550 ┆ 70.0 ┆ null ┆ 3.05 │ │ 52 ┆ NP ┆ bulk ┆ 325 ┆ … ┆ 525 ┆ 58.0 ┆ 2.4 ┆ 1.8 │ │ 54 ┆ CD ┆ bulk ┆ 440 ┆ … ┆ 700 ┆ 25.0 ┆ 1.2 ┆ null │ │ 58 ┆ QD ┆ bulk ┆ 350 ┆ … ┆ 775 ┆ 81.0 ┆ 26.5 ┆ 8.71 │ │ 65 ┆ QD ┆ bulk ┆ 360 ┆ … ┆ 700 ┆ 53.0 ┆ null ┆ null │ │ 82 ┆ dye ┆ bulk ┆ 450 ┆ … ┆ 600 ┆ 17.0 ┆ null ┆ 8.99 │ │ 88 ┆ Ln ┆ film ┆ 290 ┆ … ┆ 650 ┆ 40.0 ┆ 8.8 ┆ null │ │ 91 ┆ Ln ┆ film ┆ 325 ┆ … ┆ 650 ┆ 12.0 ┆ 1.7 ┆ null │ │ 123 ┆ QD ┆ bulk ┆ 350 ┆ … ┆ 750 ┆ 56.0 ┆ null ┆ 8.71 │ │ 126 ┆ CD ┆ film ┆ 450 ┆ … ┆ 700 ┆ 11.0 ┆ 7.58 ┆ 6.0 │ │ 131 ┆ dye ┆ solution ┆ 488 ┆ … ┆ 600 ┆ 51.0 ┆ 3.3 ┆ 0.35 │ │ 132 ┆ CD ┆ bulk ┆ 460 ┆ … ┆ 500 ┆ 54.0 ┆ 2.7 ┆ null │ │ 133 ┆ CD ┆ bulk ┆ 340 ┆ … ┆ 700 ┆ 6.0 ┆ 5.84 ┆ null │ │ 135 ┆ CD ┆ film ┆ 420 ┆ … ┆ 700 ┆ 40.0 ┆ 1.6 ┆ 0.7 │ │ 137 ┆ CD ┆ film ┆ 420 ┆ … ┆ 600 ┆ 67.0 ┆ 2.2 ┆ 1.13 │ │ 139 ┆ CD ┆ solution ┆ 491 ┆ … ┆ 700 ┆ 82.0 ┆ 5.43 ┆ 0.18 │ │ 140 ┆ CD ┆ film ┆ 510 ┆ … ┆ 650 ┆ 78.0 ┆ 0.058 ┆ 0.00083 │ │ 141 ┆ CD ┆ film ┆ 510 ┆ … ┆ 650 ┆ 78.0 ┆ 1.7 ┆ 0.014 │ │ 144 ┆ CD ┆ film ┆ 405 ┆ … ┆ 700 ┆ 70.0 ┆ 3.2 ┆ 1.9 │ │ 145 ┆ CD ┆ film ┆ 405 ┆ … ┆ 700 ┆ 65.0 ┆ 2.9 ┆ 1.7 │ │ 147 ┆ CD ┆ film ┆ 347 ┆ … ┆ 700 ┆ 61.0 ┆ 4.56 ┆ 4.1 │ │ 148 ┆ Ln ┆ bulk ┆ 405 ┆ … ┆ 700 ┆ 81.0 ┆ 3.4 ┆ 1.37 │ │ 149 ┆ CD ┆ film ┆ 490 ┆ … ┆ 650 ┆ 80.5 ┆ null ┆ 2.06 │ │ 150 ┆ CD ┆ film ┆ 490 ┆ … ┆ 650 ┆ 80.5 ┆ 4.8 ┆ 4.36 │ │ 151 ┆ CD ┆ film ┆ 380 ┆ … ┆ 700 ┆ 11.54 ┆ 1.36 ┆ null │ │ 152 ┆ QD ┆ film ┆ 300 ┆ … ┆ 550 ┆ 35.91 ┆ 3.08 ┆ null │ │ 154 ┆ CDQD ┆ film ┆ 320 ┆ … ┆ 700 ┆ 23.0 ┆ 1.89 ┆ null │ │ 155 ┆ CDQD ┆ film ┆ 320 ┆ … ┆ 700 ┆ 22.0 ┆ 2.54 ┆ null │ │ 156 ┆ CDQD ┆ film ┆ 320 ┆ … ┆ 700 ┆ 26.0 ┆ 3.76 ┆ null │ │ 157 ┆ CD ┆ film ┆ 350 ┆ … ┆ 750 ┆ 35.0 ┆ null ┆ 1.9 │ │ 158 ┆ CD ┆ film ┆ 380 ┆ … ┆ 700 ┆ 35.0 ┆ null ┆ 1.7 │ │ 159 ┆ CD ┆ film ┆ 370 ┆ … ┆ 775 ┆ 35.0 ┆ null ┆ 2.3 │ │ 160 ┆ CD ┆ film ┆ 400 ┆ … ┆ 700 ┆ 33.0 ┆ 4.5 ┆ 0.117 │ │ 161 ┆ CD ┆ bulk ┆ 400 ┆ … ┆ 700 ┆ 41.0 ┆ 5.89 ┆ 0.16 │ │ 162 ┆ CD ┆ bulk ┆ 400 ┆ … ┆ 700 ┆ 41.0 ┆ 3.13 ┆ 0.061 │ │ 168 ┆ CD ┆ bulk ┆ 470 ┆ … ┆ 650 ┆ 9.6 ┆ 9.3 ┆ null │ │ 169 ┆ CD ┆ film ┆ 380 ┆ … ┆ 700 ┆ 41.52 ┆ 3.51 ┆ 2.39 │ │ 170 ┆ CD ┆ film ┆ 355 ┆ … ┆ 650 ┆ 15.01 ┆ 2.76 ┆ 1.94 │ │ 172 ┆ CD ┆ film ┆ 400 ┆ … ┆ 750 ┆ 22.0 ┆ 4.03 ┆ 2.92 │ │ 174 ┆ QD ┆ bulk ┆ 350 ┆ … ┆ 700 ┆ 91.0 ┆ null ┆ 4.29 │ │ 175 ┆ QD ┆ bulk ┆ 350 ┆ … ┆ 700 ┆ 91.0 ┆ null ┆ 0.55 │ │ 176 ┆ QD ┆ bulk ┆ 350 ┆ … ┆ 700 ┆ 11.0 ┆ null ┆ 0.77 │ │ 202 ┆ CD ┆ film ┆ 510 ┆ … ┆ 650 ┆ 78.0 ┆ 0.035 ┆ 0.000182 │ └─────┴──────┴──────────┴──────────┴───┴────────┴────────┴──────────┴──────────┘ Cluster 4 shape: (28, 12) ┌─────┬──────┬───────┬──────────┬───┬────────┬────────┬──────────┬──────────┐ │ # ┆ mat0 ┆ mat1 ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ str ┆ str ┆ i64 ┆ ┆ i64 ┆ f64 ┆ f64 ┆ f64 │ ╞═════╪══════╪═══════╪══════════╪═══╪════════╪════════╪══════════╪══════════╡ │ 20 ┆ dye ┆ bulk ┆ 370 ┆ … ┆ 750 ┆ 67.0 ┆ 5.5 ┆ null │ │ 31 ┆ NP ┆ bulk ┆ 375 ┆ … ┆ 900 ┆ 45.0 ┆ 4.25 ┆ 1.33 │ │ 39 ┆ QD ┆ film ┆ 480 ┆ … ┆ 670 ┆ 36.2 ┆ 2.95 ┆ 2.25 │ │ 50 ┆ NP ┆ film ┆ 400 ┆ … ┆ 700 ┆ 25.0 ┆ 1.85 ┆ null │ │ 67 ┆ dye ┆ bulk ┆ 446 ┆ … ┆ 800 ┆ 89.5 ┆ 31.3 ┆ null │ │ 70 ┆ dye ┆ bulk ┆ 449 ┆ … ┆ 800 ┆ 80.0 ┆ 27.8 ┆ null │ │ 80 ┆ dye ┆ bulk ┆ 491 ┆ … ┆ 650 ┆ 95.0 ┆ 23.7 ┆ 2.81 │ │ 85 ┆ Ln ┆ film ┆ 380 ┆ … ┆ 710 ┆ 30.5 ┆ 0.34 ┆ 0.0019 │ │ 86 ┆ Ln ┆ film ┆ 360 ┆ … ┆ 700 ┆ 1.6 ┆ 0.27 ┆ 0.00078 │ │ 87 ┆ Ln ┆ film ┆ 370 ┆ … ┆ 710 ┆ 27.0 ┆ 3.2 ┆ 0.007 │ │ 89 ┆ Ln ┆ film ┆ 350 ┆ … ┆ 710 ┆ 34.0 ┆ 4.3 ┆ null │ │ 90 ┆ Ln ┆ film ┆ 325 ┆ … ┆ 710 ┆ 8.0 ┆ 1.2 ┆ null │ │ 93 ┆ Ln ┆ film ┆ 370 ┆ … ┆ 710 ┆ 63.0 ┆ 9.0 ┆ null │ │ 94 ┆ Ln ┆ film ┆ 370 ┆ … ┆ 710 ┆ 61.0 ┆ 1.2 ┆ 0.2 │ │ 95 ┆ Ln ┆ film ┆ 380 ┆ … ┆ 710 ┆ 23.0 ┆ 0.43 ┆ 0.03 │ │ 96 ┆ Ln ┆ film ┆ 360 ┆ … ┆ 710 ┆ 30.0 ┆ 0.01 ┆ 0.0006 │ │ 97 ┆ Ln ┆ fiber ┆ 360 ┆ … ┆ 710 ┆ 85.0 ┆ 2.3 ┆ 0.00086 │ │ 101 ┆ Ln ┆ film ┆ 340 ┆ … ┆ 710 ┆ 44.0 ┆ null ┆ 0.0441 │ │ 102 ┆ Ln ┆ film ┆ 405 ┆ … ┆ 710 ┆ 44.0 ┆ null ┆ 0.0499 │ │ 103 ┆ Ln ┆ film ┆ 350 ┆ … ┆ 710 ┆ 73.0 ┆ 0.28 ┆ 0.28 │ │ 104 ┆ Ln ┆ film ┆ 320 ┆ … ┆ 710 ┆ 86.0 ┆ null ┆ null │ │ 108 ┆ Ln ┆ fiber ┆ 370 ┆ … ┆ 710 ┆ 89.0 ┆ 0.7 ┆ null │ │ 111 ┆ Ln ┆ fiber ┆ 370 ┆ … ┆ 710 ┆ 89.0 ┆ null ┆ 0.08 │ │ 120 ┆ QD ┆ film ┆ 396 ┆ … ┆ 700 ┆ 53.0 ┆ null ┆ null │ │ 153 ┆ QD ┆ film ┆ 320 ┆ … ┆ 700 ┆ 32.97 ┆ 2.55 ┆ null │ │ 163 ┆ CD ┆ film ┆ 404 ┆ … ┆ 750 ┆ 86.4 ┆ 2.6 ┆ 2.3 │ │ 201 ┆ Ln ┆ film ┆ 370 ┆ … ┆ 720 ┆ 60.0 ┆ 0.02 ┆ 0.000198 │ │ 203 ┆ Ln ┆ film ┆ 370 ┆ … ┆ 720 ┆ 60.0 ┆ 0.048 ┆ 0.000471 │ └─────┴──────┴───────┴──────────┴───┴────────┴────────┴──────────┴──────────┘
plot_stats_per_cluster(df, df_clustering_index, labels)
Cluster: 0
Cluster: 1
Cluster: 2
Cluster: 3
Cluster: 4
X = df[['abs_peak', 'abs_min', 'abs_max', 'em_peak', 'em_min', 'em_max', 'QY (%)', 'hopt (%)', 'PCE (%)']].to_numpy()
X[np.isnan(X)] = 0
X
array([[3.700e+02, 3.000e+02, 4.500e+02, 6.500e+02, 4.500e+02, 7.500e+02,
6.700e+01, 5.500e+00, 0.000e+00],
[3.750e+02, 3.000e+02, 4.500e+02, 7.500e+02, 4.000e+02, 9.000e+02,
4.500e+01, 4.250e+00, 1.330e+00],
[4.800e+02, 3.000e+02, 5.000e+02, 6.190e+02, 5.700e+02, 6.700e+02,
3.620e+01, 2.950e+00, 2.250e+00],
[4.000e+02, 3.000e+02, 5.000e+02, 6.000e+02, 5.000e+02, 7.000e+02,
2.500e+01, 1.850e+00, 0.000e+00],
[4.460e+02, 2.500e+02, 5.000e+02, 5.530e+02, 5.000e+02, 8.000e+02,
8.950e+01, 3.130e+01, 0.000e+00],
[4.490e+02, 2.500e+02, 5.500e+02, 5.710e+02, 5.000e+02, 8.000e+02,
8.000e+01, 2.780e+01, 0.000e+00],
[4.910e+02, 3.000e+02, 5.000e+02, 5.810e+02, 5.500e+02, 6.500e+02,
9.500e+01, 2.370e+01, 2.810e+00],
[3.800e+02, 2.500e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.100e+02,
3.050e+01, 3.400e-01, 1.900e-03],
[3.600e+02, 2.500e+02, 3.800e+02, 5.450e+02, 4.500e+02, 7.000e+02,
1.600e+00, 2.700e-01, 7.800e-04],
[3.700e+02, 2.900e+02, 3.800e+02, 6.110e+02, 5.700e+02, 7.100e+02,
2.700e+01, 3.200e+00, 7.000e-03],
[3.500e+02, 2.400e+02, 4.200e+02, 6.120e+02, 5.700e+02, 7.100e+02,
3.400e+01, 4.300e+00, 0.000e+00],
[3.250e+02, 2.400e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.100e+02,
8.000e+00, 1.200e+00, 0.000e+00],
[3.700e+02, 2.400e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.100e+02,
6.300e+01, 9.000e+00, 0.000e+00],
[3.700e+02, 3.000e+02, 3.800e+02, 6.150e+02, 5.700e+02, 7.100e+02,
6.100e+01, 1.200e+00, 2.000e-01],
[3.800e+02, 2.400e+02, 3.800e+02, 6.120e+02, 5.700e+02, 7.100e+02,
2.300e+01, 4.300e-01, 3.000e-02],
[3.600e+02, 2.400e+02, 3.800e+02, 6.120e+02, 5.700e+02, 7.100e+02,
3.000e+01, 1.000e-02, 6.000e-04],
[3.600e+02, 2.400e+02, 4.200e+02, 6.150e+02, 5.700e+02, 7.100e+02,
8.500e+01, 2.300e+00, 8.600e-04],
[3.400e+02, 2.500e+02, 3.800e+02, 6.130e+02, 5.700e+02, 7.100e+02,
4.400e+01, 0.000e+00, 4.410e-02],
[4.050e+02, 2.500e+02, 4.200e+02, 6.130e+02, 5.700e+02, 7.100e+02,
4.400e+01, 0.000e+00, 4.990e-02],
[3.500e+02, 2.500e+02, 4.000e+02, 6.130e+02, 5.700e+02, 7.100e+02,
7.300e+01, 2.800e-01, 2.800e-01],
[3.200e+02, 2.500e+02, 3.500e+02, 6.130e+02, 5.700e+02, 7.100e+02,
8.600e+01, 0.000e+00, 0.000e+00],
[3.700e+02, 3.000e+02, 4.500e+02, 6.150e+02, 5.700e+02, 7.100e+02,
8.900e+01, 7.000e-01, 0.000e+00],
[3.700e+02, 3.000e+02, 4.500e+02, 6.150e+02, 5.700e+02, 7.100e+02,
8.900e+01, 0.000e+00, 8.000e-02],
[3.960e+02, 3.500e+02, 4.500e+02, 5.820e+02, 5.000e+02, 7.000e+02,
5.300e+01, 0.000e+00, 0.000e+00],
[3.200e+02, 3.000e+02, 5.500e+02, 6.700e+02, 6.000e+02, 7.000e+02,
3.297e+01, 2.550e+00, 0.000e+00],
[4.040e+02, 3.000e+02, 5.500e+02, 5.940e+02, 5.000e+02, 7.500e+02,
8.640e+01, 2.600e+00, 2.300e+00],
[3.700e+02, 2.500e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.200e+02,
6.000e+01, 2.000e-02, 1.980e-04],
[3.700e+02, 2.500e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.200e+02,
6.000e+01, 4.800e-02, 4.710e-04]])
# Get the Original Values
QY = X.T[6]
hopt = X.T[7]
PCE = X.T[8]
# Get the Masks for the non-zero values
QY_MASK = QY != 0
hopt_MASK = hopt != 0
PCE_MASK = PCE != 0
# Convert boolean mask to index one
QY_MASK = [i for i in range(len(QY_MASK)) if QY_MASK[i]]
hopt_MASK = [i for i in range(len(hopt_MASK)) if hopt_MASK[i]]
PCE_MASK = [i for i in range(len(PCE_MASK)) if PCE_MASK[i]]
# Compute the average and standard deviation
QY_AVG = np.average(QY[QY_MASK])
QY_STD = np.std(QY[QY_MASK])
hopt_AVG = np.average(hopt[hopt_MASK])
hopt_STD = np.std(hopt[hopt_MASK])
PCE_AVG = np.average(PCE[PCE_MASK])
PCE_STD = np.std(PCE[PCE_MASK])
test = .3
random.seed(42)
# Select the index of the test set
QY_TEST_SET_INDEX = random.sample(QY_MASK, int(len(QY_MASK)*test))
hopt_TEST_SET_INDEX = random.sample(hopt_MASK, int(len(hopt_MASK)*test))
PCE_TEST_SET_INDEX = random.sample(PCE_MASK, int(len(PCE_MASK)*test))
print(f'{QY_TEST_SET_INDEX}\n{hopt_TEST_SET_INDEX}\n{PCE_TEST_SET_INDEX}')
[20, 3, 0, 23, 8, 7, 22, 4] [3, 26, 19, 2, 21, 13] [2, 1, 27, 7, 22]
# Store original values from the test set
QY_TEST_SET = QY[QY_TEST_SET_INDEX]
hopt_TEST_SET = hopt[hopt_TEST_SET_INDEX]
PCE_TEST_SET = PCE[PCE_TEST_SET_INDEX]
print(f'{QY_TEST_SET}\n{hopt_TEST_SET}\n{PCE_TEST_SET}')
# Replace the test set with zeros
QY[QY_TEST_SET_INDEX] = 0
hopt[hopt_TEST_SET_INDEX] = 0
PCE[PCE_TEST_SET_INDEX] = 0
print(f'{QY[QY_TEST_SET_INDEX]}\n{hopt[hopt_TEST_SET_INDEX]}\n{PCE[PCE_TEST_SET_INDEX]}')
[86. 25. 67. 53. 1.6 30.5 89. 89.5] [1.85 0.02 0.28 2.95 0.7 1.2 ] [2.25e+00 1.33e+00 4.71e-04 1.90e-03 8.00e-02] [0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.]
# Write on the X Matrix again
X.T[6] = QY
X.T[7] = hopt
X.T[8] = PCE
# Grid Search
ks = [2,3,4,5,6,7,8]
seeds = [53, 59, 61, 67, 71, 73, 79, 83, 89, 97]
# Store best solution (min cost)
min_cost = float('inf')
solution = None
for k in ks:
for s in seeds:
Xr, W, H, cost = nmf.nmf_mu(X, k=k, seed=s)
if cost < min_cost:
min_cost = cost
solution = (k, s)
print(f"{cost} {solution}")
49.7770480742326 (8, 59)
# Measure the performance of the reconstruction
Xr, W, H, cost = nmf.nmf_mu(X, k=solution[0], seed=solution[1])
# Get the predicted QY, hopt and PCE
QY = Xr.T[6]
hopt = Xr.T[7]
PCE = Xr.T[8]
QY_PREDICT_SET = QY[QY_TEST_SET_INDEX]
hopt_PREDICT_SET = hopt[hopt_TEST_SET_INDEX]
PCE_PREDICT_SET = PCE[PCE_TEST_SET_INDEX]
print(f'{QY_PREDICT_SET}\n{hopt_PREDICT_SET}\n{PCE_PREDICT_SET}')
[15.97727897 62.32648612 90.01184508 19.35966872 59.88768401 72.13344412 18.33772585 46.85491737] [12.57010428 4.919815 1.98392266 24.25932488 2.95390967 1.06247509] [1.83368088 0.07531809 0.58336117 0.60496385 0.67724153]
rmse_qy = math.sqrt(mean_squared_error(QY_TEST_SET, QY_PREDICT_SET))
rmse_hopt = math.sqrt(mean_squared_error(hopt_TEST_SET, hopt_PREDICT_SET))
rmse_PCE = math.sqrt(mean_squared_error(PCE_TEST_SET, PCE_PREDICT_SET))
print(f'RMSE(QY): {rmse_qy} ({QY_AVG} ± {QY_STD})')
print(f'RMSE(HOPT): {rmse_hopt} ({hopt_AVG} ± {hopt_STD})')
print(f'RMSE(PCE): {rmse_PCE} ({PCE_AVG} ± {PCE_STD})')
RMSE(QY): 49.875122512813185 (54.22035714285715 ± 26.58863969041543) RMSE(HOPT): 10.008484102517194 (5.469478260869565 ± 8.892388483707826) RMSE(PCE): 0.7493587531423203 (0.5521064117647059 ± 0.9382626018509556)
# Grid Search
ks = [2,3,4,5,6,7,8]
seeds = [53, 59, 61, 67, 71, 73, 79, 83, 89, 97]
# Store best solution (min cost)
min_cost = float('inf')
solution = None
for k in ks:
for s in seeds:
Xr, _, _, _ = nmf.nmf_mu(X, k=k, seed=s)
# Get the values with the mask
hopt = Xr.T[7]
PCE = Xr.T[8]
hopt_PREDICT_SET = hopt[hopt_TEST_SET_INDEX]
PCE_PREDICT_SET = PCE[PCE_TEST_SET_INDEX]
# Compute the cost of the test set
rmse_hopt = math.sqrt(mean_squared_error(hopt_TEST_SET, hopt_PREDICT_SET))
rmse_PCE = math.sqrt(mean_squared_error(PCE_TEST_SET, PCE_PREDICT_SET))
#cost = rmse_hopt + rmse_PCE
#cost = rmse_hopt
cost = rmse_PCE
if cost < min_cost:
min_cost = cost
solution = (k, s)
print(f"{cost} {solution}")
0.6433135652975813 (6, 83)
# Measure the performance of the reconstruction
Xr, W, H, cost = nmf.nmf_mu(X, k=solution[0], seed=solution[1])
# Get the predicted QY, hopt and PCE
QY = Xr.T[6]
hopt = Xr.T[7]
PCE = Xr.T[8]
QY_PREDICT_SET = QY[QY_TEST_SET_INDEX]
hopt_PREDICT_SET = hopt[hopt_TEST_SET_INDEX]
PCE_PREDICT_SET = PCE[PCE_TEST_SET_INDEX]
print(f'{QY_PREDICT_SET}\n{hopt_PREDICT_SET}\n{PCE_PREDICT_SET}')
[ 37.75873316 80.51076649 62.08184412 143.49604739 73.47495419 43.23225725 94.97112549 94.22130926] [11.68839014 5.07296792 4.07583527 19.44763593 5.26625266 4.70790051] [1.9742166 2.1266637 0.3299664 0.51122061 0.54863358]
rmse_qy = math.sqrt(mean_squared_error(QY_TEST_SET, QY_PREDICT_SET))
rmse_hopt = math.sqrt(mean_squared_error(hopt_TEST_SET, hopt_PREDICT_SET))
rmse_PCE = math.sqrt(mean_squared_error(PCE_TEST_SET, PCE_PREDICT_SET))
print(f'RMSE(QY): {rmse_qy} ({QY_AVG} ± {QY_STD})')
print(f'RMSE(HOPT): {rmse_hopt} ({hopt_AVG} ± {hopt_STD})')
print(f'RMSE(PCE): {rmse_PCE} ({PCE_AVG} ± {PCE_STD})')
RMSE(QY): 48.74481047927994 (54.22035714285715 ± 26.58863969041543) RMSE(HOPT): 8.583538425748353 (5.469478260869565 ± 8.892388483707826) RMSE(PCE): 0.5095726953252181 (0.5521064117647059 ± 0.9382626018509556)
# Get the original matrix
X = df[['abs_peak', 'abs_min', 'abs_max', 'em_peak', 'em_min', 'em_max', 'QY (%)', 'hopt (%)', 'PCE (%)']].to_numpy()
X[np.isnan(X)] = 0
print(X.shape)
# Get the reconstructed matrix
Xr, _, _, _ = nmf.nmf_mu(X, k=solution[0], seed=solution[1])
# Replace all zeros with reconstructed values
X[X==0] = Xr[X==0]
(28, 9)
X
array([[3.70000000e+02, 3.00000000e+02, 4.50000000e+02, 6.50000000e+02,
4.50000000e+02, 7.50000000e+02, 6.70000000e+01, 5.50000000e+00,
1.30557497e+00],
[3.75000000e+02, 3.00000000e+02, 4.50000000e+02, 7.50000000e+02,
4.00000000e+02, 9.00000000e+02, 4.50000000e+01, 4.25000000e+00,
1.33000000e+00],
[4.80000000e+02, 3.00000000e+02, 5.00000000e+02, 6.19000000e+02,
5.70000000e+02, 6.70000000e+02, 3.62000000e+01, 2.95000000e+00,
2.25000000e+00],
[4.00000000e+02, 3.00000000e+02, 5.00000000e+02, 6.00000000e+02,
5.00000000e+02, 7.00000000e+02, 2.50000000e+01, 1.85000000e+00,
1.78174879e+00],
[4.46000000e+02, 2.50000000e+02, 5.00000000e+02, 5.53000000e+02,
5.00000000e+02, 8.00000000e+02, 8.95000000e+01, 3.13000000e+01,
1.14920925e+00],
[4.49000000e+02, 2.50000000e+02, 5.50000000e+02, 5.71000000e+02,
5.00000000e+02, 8.00000000e+02, 8.00000000e+01, 2.78000000e+01,
1.69867716e+00],
[4.91000000e+02, 3.00000000e+02, 5.00000000e+02, 5.81000000e+02,
5.50000000e+02, 6.50000000e+02, 9.50000000e+01, 2.37000000e+01,
2.81000000e+00],
[3.80000000e+02, 2.50000000e+02, 4.00000000e+02, 6.12000000e+02,
5.70000000e+02, 7.10000000e+02, 3.05000000e+01, 3.40000000e-01,
1.90000000e-03],
[3.60000000e+02, 2.50000000e+02, 3.80000000e+02, 5.45000000e+02,
4.50000000e+02, 7.00000000e+02, 1.60000000e+00, 2.70000000e-01,
7.80000000e-04],
[3.70000000e+02, 2.90000000e+02, 3.80000000e+02, 6.11000000e+02,
5.70000000e+02, 7.10000000e+02, 2.70000000e+01, 3.20000000e+00,
7.00000000e-03],
[3.50000000e+02, 2.40000000e+02, 4.20000000e+02, 6.12000000e+02,
5.70000000e+02, 7.10000000e+02, 3.40000000e+01, 4.30000000e+00,
8.43871139e-01],
[3.25000000e+02, 2.40000000e+02, 4.00000000e+02, 6.12000000e+02,
5.70000000e+02, 7.10000000e+02, 8.00000000e+00, 1.20000000e+00,
8.08810112e-01],
[3.70000000e+02, 2.40000000e+02, 4.00000000e+02, 6.12000000e+02,
5.70000000e+02, 7.10000000e+02, 6.30000000e+01, 9.00000000e+00,
4.13560588e-01],
[3.70000000e+02, 3.00000000e+02, 3.80000000e+02, 6.15000000e+02,
5.70000000e+02, 7.10000000e+02, 6.10000000e+01, 1.20000000e+00,
2.00000000e-01],
[3.80000000e+02, 2.40000000e+02, 3.80000000e+02, 6.12000000e+02,
5.70000000e+02, 7.10000000e+02, 2.30000000e+01, 4.30000000e-01,
3.00000000e-02],
[3.60000000e+02, 2.40000000e+02, 3.80000000e+02, 6.12000000e+02,
5.70000000e+02, 7.10000000e+02, 3.00000000e+01, 1.00000000e-02,
6.00000000e-04],
[3.60000000e+02, 2.40000000e+02, 4.20000000e+02, 6.15000000e+02,
5.70000000e+02, 7.10000000e+02, 8.50000000e+01, 2.30000000e+00,
8.60000000e-04],
[3.40000000e+02, 2.50000000e+02, 3.80000000e+02, 6.13000000e+02,
5.70000000e+02, 7.10000000e+02, 4.40000000e+01, 4.95152985e+00,
4.41000000e-02],
[4.05000000e+02, 2.50000000e+02, 4.20000000e+02, 6.13000000e+02,
5.70000000e+02, 7.10000000e+02, 4.40000000e+01, 5.16163221e+00,
4.99000000e-02],
[3.50000000e+02, 2.50000000e+02, 4.00000000e+02, 6.13000000e+02,
5.70000000e+02, 7.10000000e+02, 7.30000000e+01, 2.80000000e-01,
2.80000000e-01],
[3.20000000e+02, 2.50000000e+02, 3.50000000e+02, 6.13000000e+02,
5.70000000e+02, 7.10000000e+02, 8.60000000e+01, 6.49216118e+00,
1.22326850e-01],
[3.70000000e+02, 3.00000000e+02, 4.50000000e+02, 6.15000000e+02,
5.70000000e+02, 7.10000000e+02, 8.90000000e+01, 7.00000000e-01,
6.61502043e-01],
[3.70000000e+02, 3.00000000e+02, 4.50000000e+02, 6.15000000e+02,
5.70000000e+02, 7.10000000e+02, 8.90000000e+01, 9.63044260e+00,
8.00000000e-02],
[3.96000000e+02, 3.50000000e+02, 4.50000000e+02, 5.82000000e+02,
5.00000000e+02, 7.00000000e+02, 5.30000000e+01, 8.78474482e+00,
1.08010617e+00],
[3.20000000e+02, 3.00000000e+02, 5.50000000e+02, 6.70000000e+02,
6.00000000e+02, 7.00000000e+02, 3.29700000e+01, 2.55000000e+00,
1.99733817e+00],
[4.04000000e+02, 3.00000000e+02, 5.50000000e+02, 5.94000000e+02,
5.00000000e+02, 7.50000000e+02, 8.64000000e+01, 2.60000000e+00,
2.30000000e+00],
[3.70000000e+02, 2.50000000e+02, 4.00000000e+02, 6.12000000e+02,
5.70000000e+02, 7.20000000e+02, 6.00000000e+01, 2.00000000e-02,
1.98000000e-04],
[3.70000000e+02, 2.50000000e+02, 4.00000000e+02, 6.12000000e+02,
5.70000000e+02, 7.20000000e+02, 6.00000000e+01, 4.80000000e-02,
4.71000000e-04]])
Xr, W, H, cost = nmf.nmf_mu(X, k=solution[0], seed=solution[1])
_, cols = X.shape
field_names = df.columns
print(f'{field_names}')
df = pl.DataFrame({field_names[i+2]: X[:,i] for i in range(cols)})
df
['#', 'mat0', 'mat1', 'abs_peak', 'abs_min', 'abs_max', 'em_peak', 'em_min', 'em_max', 'QY (%)', 'hopt (%)', 'PCE (%)']
| mat1 | abs_peak | abs_min | abs_max | em_peak | em_min | em_max | QY (%) | hopt (%) |
|---|---|---|---|---|---|---|---|---|
| f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 370.0 | 300.0 | 450.0 | 650.0 | 450.0 | 750.0 | 67.0 | 5.5 | 1.305575 |
| 375.0 | 300.0 | 450.0 | 750.0 | 400.0 | 900.0 | 45.0 | 4.25 | 1.33 |
| 480.0 | 300.0 | 500.0 | 619.0 | 570.0 | 670.0 | 36.2 | 2.95 | 2.25 |
| 400.0 | 300.0 | 500.0 | 600.0 | 500.0 | 700.0 | 25.0 | 1.85 | 1.781749 |
| 446.0 | 250.0 | 500.0 | 553.0 | 500.0 | 800.0 | 89.5 | 31.3 | 1.149209 |
| 449.0 | 250.0 | 550.0 | 571.0 | 500.0 | 800.0 | 80.0 | 27.8 | 1.698677 |
| 491.0 | 300.0 | 500.0 | 581.0 | 550.0 | 650.0 | 95.0 | 23.7 | 2.81 |
| 380.0 | 250.0 | 400.0 | 612.0 | 570.0 | 710.0 | 30.5 | 0.34 | 0.0019 |
| 360.0 | 250.0 | 380.0 | 545.0 | 450.0 | 700.0 | 1.6 | 0.27 | 0.00078 |
| 370.0 | 290.0 | 380.0 | 611.0 | 570.0 | 710.0 | 27.0 | 3.2 | 0.007 |
| 350.0 | 240.0 | 420.0 | 612.0 | 570.0 | 710.0 | 34.0 | 4.3 | 0.843871 |
| 325.0 | 240.0 | 400.0 | 612.0 | 570.0 | 710.0 | 8.0 | 1.2 | 0.80881 |
| 370.0 | 240.0 | 400.0 | 612.0 | 570.0 | 710.0 | 63.0 | 9.0 | 0.413561 |
| 370.0 | 300.0 | 380.0 | 615.0 | 570.0 | 710.0 | 61.0 | 1.2 | 0.2 |
| 380.0 | 240.0 | 380.0 | 612.0 | 570.0 | 710.0 | 23.0 | 0.43 | 0.03 |
| 360.0 | 240.0 | 380.0 | 612.0 | 570.0 | 710.0 | 30.0 | 0.01 | 0.0006 |
| 360.0 | 240.0 | 420.0 | 615.0 | 570.0 | 710.0 | 85.0 | 2.3 | 0.00086 |
| 340.0 | 250.0 | 380.0 | 613.0 | 570.0 | 710.0 | 44.0 | 4.95153 | 0.0441 |
| 405.0 | 250.0 | 420.0 | 613.0 | 570.0 | 710.0 | 44.0 | 5.161632 | 0.0499 |
| 350.0 | 250.0 | 400.0 | 613.0 | 570.0 | 710.0 | 73.0 | 0.28 | 0.28 |
| 320.0 | 250.0 | 350.0 | 613.0 | 570.0 | 710.0 | 86.0 | 6.492161 | 0.122327 |
| 370.0 | 300.0 | 450.0 | 615.0 | 570.0 | 710.0 | 89.0 | 0.7 | 0.661502 |
| 370.0 | 300.0 | 450.0 | 615.0 | 570.0 | 710.0 | 89.0 | 9.630443 | 0.08 |
| 396.0 | 350.0 | 450.0 | 582.0 | 500.0 | 700.0 | 53.0 | 8.784745 | 1.080106 |
| 320.0 | 300.0 | 550.0 | 670.0 | 600.0 | 700.0 | 32.97 | 2.55 | 1.997338 |
| 404.0 | 300.0 | 550.0 | 594.0 | 500.0 | 750.0 | 86.4 | 2.6 | 2.3 |
| 370.0 | 250.0 | 400.0 | 612.0 | 570.0 | 720.0 | 60.0 | 0.02 | 0.000198 |
| 370.0 | 250.0 | 400.0 | 612.0 | 570.0 | 720.0 | 60.0 | 0.048 | 0.000471 |